From f92e1f44a0e076f27ecafe942d42ae84ad681c09 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 05:52:07 -0700
Subject: [PATCH 01/10] Add multimodal support (llama.cpp)

---
 css/main.css                |  13 +++++
 modules/chat.py             | 100 ++++++++++++++++++++++++++----------
 modules/html_generator.py   |  29 +++++++----
 modules/llama_cpp_server.py |  11 ++++
 4 files changed, 117 insertions(+), 36 deletions(-)

diff --git a/css/main.css b/css/main.css
index 181a19b8..268ddb74 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1550,3 +1550,16 @@ strong {
     color: var(--body-text-color-subdued);
     margin-top: 4px;
 }
+
+.image-attachment {
+    flex-direction: column;
+}
+
+.attachment-image {
+    border-radius: 16px;
+    margin-bottom: 5px;
+    object-fit: cover;
+    object-position: center;
+    border: 2px solid var(--border-color-primary);
+    aspect-ratio: 1 / 1;
+}
diff --git a/modules/chat.py b/modules/chat.py
index b2aacd5c..1a7556c8 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -220,13 +220,22 @@ def generate_chat_prompt(user_input, state, **kwargs):
             # Add attachment content if present
             if user_key in metadata and "attachments" in metadata[user_key]:
                 attachments_text = ""
-                for attachment in metadata[user_key]["attachments"]:
-                    filename = attachment.get("name", "file")
-                    content = attachment.get("content", "")
-                    attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+                image_refs = ""
 
-                if attachments_text:
-                    enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}"
+                for attachment in metadata[user_key]["attachments"]:
+                    if attachment.get("type") == "image":
+                        # Add image reference for multimodal models
+                        image_refs += f"[img-{attachment['image_id']}]"
+                    else:
+                        # Handle text/PDF attachments as before
+                        filename = attachment.get("name", "file")
+                        content = attachment.get("content", "")
+                        attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+
+                if image_refs or attachments_text:
+                    enhanced_user_msg = f"{image_refs}{user_msg}"
+                    if attachments_text:
+                        enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
 
                 messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg})
 
@@ -240,22 +249,29 @@ def generate_chat_prompt(user_input, state, **kwargs):
         has_attachments = user_key in metadata and "attachments" in metadata[user_key]
 
         if (user_input or has_attachments) and not impersonate and not _continue:
-            # For the current user input being processed, check if we need to add attachments
-            if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
-                current_row_idx = len(history)
-                user_key = f"user_{current_row_idx}"
+            current_row_idx = len(history)
+            user_key = f"user_{current_row_idx}"
 
-                if user_key in metadata and "attachments" in metadata[user_key]:
-                    attachments_text = ""
-                    for attachment in metadata[user_key]["attachments"]:
+            enhanced_user_input = user_input
+
+            if user_key in metadata and "attachments" in metadata[user_key]:
+                attachments_text = ""
+                image_refs = ""
+
+                for attachment in metadata[user_key]["attachments"]:
+                    if attachment.get("type") == "image":
+                        image_refs += f"[img-{attachment['image_id']}]"
+                    else:
                         filename = attachment.get("name", "file")
                         content = attachment.get("content", "")
                         attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
 
+                if image_refs or attachments_text:
+                    enhanced_user_input = f"{image_refs}{user_input}"
                     if attachments_text:
-                        user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}"
+                        enhanced_user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
 
-        messages.append({"role": "user", "content": user_input})
+        messages.append({"role": "user", "content": enhanced_user_input})
 
     def make_prompt(messages):
         if state['mode'] == 'chat-instruct' and _continue:
@@ -493,26 +509,43 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
     file_extension = path.suffix.lower()
 
     try:
-        # Handle different file types
-        if file_extension == '.pdf':
+        # Handle image files
+        if file_extension in ['.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif']:
+            # Convert image to base64
+            with open(path, 'rb') as f:
+                image_data = base64.b64encode(f.read()).decode('utf-8')
+
+            # Generate unique image ID
+            image_id = len([att for att in history['metadata'][key]["attachments"] if att.get("type") == "image"]) + 1
+
+            attachment = {
+                "name": filename,
+                "type": "image",
+                "image_data": image_data,
+                "image_id": image_id,
+                "file_path": str(path)  # For UI preview
+            }
+
+        elif file_extension == '.pdf':
             # Process PDF file
             content = extract_pdf_text(path)
-            file_type = "application/pdf"
+            attachment = {
+                "name": filename,
+                "type": "application/pdf",
+                "content": content,
+            }
         else:
             # Default handling for text files
             with open(path, 'r', encoding='utf-8') as f:
                 content = f.read()
-            file_type = "text/plain"
-
-        # Add attachment
-        attachment = {
-            "name": filename,
-            "type": file_type,
-            "content": content,
-        }
+            attachment = {
+                "name": filename,
+                "type": "text/plain",
+                "content": content,
+            }
 
         history['metadata'][key]["attachments"].append(attachment)
-        return content  # Return the content for reuse
+        return attachment  # Return the attachment for reuse
     except Exception as e:
         logger.error(f"Error processing attachment {filename}: {e}")
         return None
@@ -567,6 +600,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
         for file_path in files:
             add_message_attachment(output, row_idx, file_path, is_user=True)
 
+        # Collect image attachments for llama.cpp
+        image_attachments = []
+        if 'metadata' in output:
+            user_key = f"user_{row_idx}"
+            if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
+                for attachment in output['metadata'][user_key]["attachments"]:
+                    if attachment.get("type") == "image":
+                        image_attachments.append(attachment)
+
+        # Add image attachments to state for the generation
+        if image_attachments:
+            state['image_attachments'] = image_attachments
+
         # Add web search results as attachments if enabled
         add_web_search_attachments(output, row_idx, text, state)
 
diff --git a/modules/html_generator.py b/modules/html_generator.py
index bfb278cd..aa037314 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -372,16 +372,27 @@ def format_message_attachments(history, role, index):
         for attachment in attachments:
             name = html.escape(attachment["name"])
 
-            # Make clickable if URL exists
-            if "url" in attachment:
-                name = f'<a href="{html.escape(attachment["url"])}" target="_blank" rel="noopener noreferrer">{name}</a>'
+            if attachment.get("type") == "image":
+                # Show image preview
+                file_path = attachment.get("file_path", "")
+                attachments_html += (
+                    f'<div class="attachment-box image-attachment">'
+                    f'<img src="file/{file_path}" class="attachment-image" alt="{name}">'
+                    f'<div class="attachment-name">{name}</div>'
+                    f'</div>'
+                )
+            else:
+                # Make clickable if URL exists (web search)
+                if "url" in attachment:
+                    name = f'<a href="{html.escape(attachment["url"])}" target="_blank" rel="noopener noreferrer">{name}</a>'
+
+                attachments_html += (
+                    f'<div class="attachment-box">'
+                    f'<div class="attachment-icon">{attachment_svg}</div>'
+                    f'<div class="attachment-name">{name}</div>'
+                    f'</div>'
+                )
 
-            attachments_html += (
-                f'<div class="attachment-box">'
-                f'<div class="attachment-icon">{attachment_svg}</div>'
-                f'<div class="attachment-name">{name}</div>'
-                f'</div>'
-            )
         attachments_html += '</div>'
         return attachments_html
 
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index d695c74e..36411105 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -140,6 +140,17 @@ class LlamaServer:
             "cache_prompt": True
         })
 
+        # Add image data if present
+        if 'image_attachments' in state:
+            image_data = []
+            for attachment in state['image_attachments']:
+                image_data.append({
+                    "data": attachment['image_data'],
+                    "id": attachment['image_id']
+                })
+            if image_data:
+                payload["image_data"] = image_data
+
         if shared.args.verbose:
             logger.info("GENERATE_PARAMS=")
             printable_payload = {k: v for k, v in payload.items() if k != "prompt"}

From 2e21b1f5e345702f5fa9075aa0697cefe077b72f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 09:14:26 -0700
Subject: [PATCH 02/10] Integrate with the API

---
 extensions/openai/completions.py | 129 ++++++++++++++++++++++++++++++-
 1 file changed, 125 insertions(+), 4 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 5181b18b..17f5dc9b 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -1,8 +1,11 @@
+import base64
 import copy
 import json
 import time
 from collections import deque
+from datetime import datetime
 
+import requests
 import tiktoken
 from pydantic import ValidationError
 
@@ -16,6 +19,7 @@ from modules.chat import (
     load_character_memoized,
     load_instruction_template_memoized
 )
+from modules.logging_colors import logger
 from modules.presets import load_preset_memoized
 from modules.text_generation import decode, encode, generate_reply
 
@@ -82,6 +86,67 @@ def process_parameters(body, is_legacy=False):
     return generate_params
 
 
+def get_current_timestamp():
+    """Returns the current time in 24-hour format"""
+    return datetime.now().strftime('%b %d, %Y %H:%M')
+
+
+def process_image_url(url, image_id):
+    """Process an image URL and return attachment data"""
+    try:
+        if url.startswith("data:"):
+            # Handle data URL (data:image/jpeg;base64,...)
+            if "base64," in url:
+                image_data = url.split("base64,", 1)[1]
+            else:
+                raise ValueError("Unsupported data URL format")
+        else:
+            # Handle regular URL - download image
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+            image_data = base64.b64encode(response.content).decode('utf-8')
+
+        return {
+            "name": f"image_{image_id}",
+            "type": "image",
+            "image_data": image_data,
+            "image_id": image_id,
+            "file_path": f"api_image_{image_id}",  # Add this for consistency with UI
+        }
+    except Exception as e:
+        logger.error(f"Error processing image URL {url}: {e}")
+        return None
+
+
+def process_multimodal_content(content):
+    """Process multimodal content and return text content and attachments"""
+    if isinstance(content, str):
+        return content, []
+
+    if isinstance(content, list):
+        text_content = ""
+        image_refs = ""
+        attachments = []
+
+        for item in content:
+            if item.get("type") == "text":
+                text_content += item.get("text", "")
+            elif item.get("type") == "image_url":
+                image_url = item.get("image_url", {}).get("url", "")
+                if image_url:
+                    attachment = process_image_url(image_url, len(attachments) + 1)
+                    if attachment:
+                        attachments.append(attachment)
+                        image_refs += f"[img-{attachment['image_id']}]"
+                    else:
+                        # Log warning but continue processing
+                        logger.warning(f"Failed to process image URL: {image_url}")
+
+        return f"{image_refs}{text_content}", attachments
+
+    return str(content), []
+
+
 def convert_history(history):
     '''
     Chat histories in this program are in the format [message, reply].
@@ -93,26 +158,46 @@ def convert_history(history):
     user_input = ""
     user_input_last = True
     system_message = ""
+    metadata = {}
+
+    # Keep track of attachments for the current message being built
+    pending_attachments = []
 
     for entry in history:
         content = entry["content"]
         role = entry["role"]
 
         if role == "user":
-            user_input = content
+            # Process multimodal content
+            processed_content, attachments = process_multimodal_content(content)
+            user_input = processed_content
             user_input_last = True
+
             if current_message:
                 chat_dialogue.append([current_message, '', ''])
                 current_message = ""
-            current_message = content
+
+            current_message = processed_content
+            pending_attachments = attachments  # Store attachments for when message is added
+
         elif role == "assistant":
             if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "":
                 continue  # skip tool calls
             current_reply = content
             user_input_last = False
             if current_message:
+                row_idx = len(chat_dialogue)  # Calculate index here, right before adding
                 chat_dialogue.append([current_message, current_reply, ''])
+
+                # Add attachments to metadata if any
+                if pending_attachments:
+                    user_key = f"user_{row_idx}"
+                    metadata[user_key] = {
+                        "timestamp": get_current_timestamp(),
+                        "attachments": pending_attachments
+                    }
+                    pending_attachments = []  # Clear pending attachments
+
                 current_message = ""
                 current_reply = ""
         else:
@@ -123,10 +208,19 @@ def convert_history(history):
         elif role == "system":
             system_message += f"\n{content}" if system_message else content
 
+    # Handle case where there's a pending user message at the end
+    if current_message and pending_attachments:
+        row_idx = len(chat_dialogue)  # This will be the index when the message is processed
+        user_key = f"user_{row_idx}"
+        metadata[user_key] = {
+            "timestamp": get_current_timestamp(),
+            "attachments": pending_attachments
+        }
+
     if not user_input_last:
         user_input = ""
 
-    return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue)}
+    return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue), 'metadata': metadata}
 
 
 def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, prompt_only=False) -> dict:
@@ -150,9 +244,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         elif m['role'] == 'function':
             raise InvalidRequestError(message="role: function is not supported.", param='messages')
 
-        if 'content' not in m and "image_url" not in m:
+        # Handle multimodal content validation
+        content = m.get('content')
+        if content is None:
             raise InvalidRequestError(message="messages: missing content", param='messages')
 
+        # Validate multimodal content structure
+        if isinstance(content, list):
+            for item in content:
+                if not isinstance(item, dict) or 'type' not in item:
+                    raise InvalidRequestError(message="messages: invalid content item format", param='messages')
+                if item['type'] not in ['text', 'image_url']:
+                    raise InvalidRequestError(message="messages: unsupported content type", param='messages')
+                if item['type'] == 'text' and 'text' not in item:
+                    raise InvalidRequestError(message="messages: missing text in content item", param='messages')
+                if item['type'] == 'image_url' and ('image_url' not in item or 'url' not in item['image_url']):
+                    raise InvalidRequestError(message="messages: missing image_url in content item", param='messages')
+
     # Chat Completions
     object_type = 'chat.completion' if not stream else 'chat.completion.chunk'
     created_time = int(time.time())
@@ -189,6 +297,15 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     # History
     user_input, custom_system_message, history = convert_history(messages)
 
+    # Collect image attachments for multimodal support
+    image_attachments = []
+    if 'metadata' in history:
+        for key, value in history['metadata'].items():
+            if 'attachments' in value:
+                for attachment in value['attachments']:
+                    if attachment.get('type') == 'image':
+                        image_attachments.append(attachment)
+
     generate_params.update({
         'mode': body['mode'],
         'name1': name1,
@@ -205,6 +322,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         'stream': stream
     })
 
+    # Add image attachments to state for llama.cpp multimodal support
+    if image_attachments:
+        generate_params['image_attachments'] = image_attachments
+
     max_tokens = generate_params['max_new_tokens']
     if max_tokens in [None, 0]:
         generate_params['max_new_tokens'] = 512

From c1a47a0b606428f77ea8d70d2572fe6d5310d72e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 09:17:02 -0700
Subject: [PATCH 03/10] Better request header

---
 extensions/openai/completions.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 17f5dc9b..939bd9c3 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -102,7 +102,10 @@ def process_image_url(url, image_id):
             raise ValueError("Unsupported data URL format")
         else:
             # Handle regular URL - download image
-            response = requests.get(url, timeout=10)
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+            }
+            response = requests.get(url, timeout=10, headers=headers)
             response.raise_for_status()
             image_data = base64.b64encode(response.content).decode('utf-8')
 
@@ -111,7 +114,7 @@ def process_image_url(url, image_id):
             "type": "image",
             "image_data": image_data,
             "image_id": image_id,
-            "file_path": f"api_image_{image_id}",  # Add this for consistency with UI
+            "file_path": f"api_image_{image_id}",
         }
     except Exception as e:
         logger.error(f"Error processing image URL {url}: {e}")

From c6d0de8538b4ff76f4a0f1dd9dfd57dd19772a23 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 09:28:20 -0700
Subject: [PATCH 04/10] Better image positioning in prompts

---
 extensions/openai/completions.py | 3 +--
 modules/chat.py                  | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 939bd9c3..98cfba05 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -140,12 +140,11 @@ def process_multimodal_content(content):
                     attachment = process_image_url(image_url, len(attachments) + 1)
                     if attachment:
                         attachments.append(attachment)
-                        image_refs += f"[img-{attachment['image_id']}]"
                     else:
                         # Log warning but continue processing
                         logger.warning(f"Failed to process image URL: {image_url}")
 
-        return f"{image_refs}{text_content}", attachments
+        return text_content, attachments
 
     return str(content), []
 
diff --git a/modules/chat.py b/modules/chat.py
index 1a7556c8..55e7866c 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -233,7 +233,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
                     attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
 
                 if image_refs or attachments_text:
-                    enhanced_user_msg = f"{image_refs}{user_msg}"
+                    enhanced_user_msg = f"{user_msg} {image_refs}"
                     if attachments_text:
                         enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
 
@@ -267,7 +267,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
                     attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
 
                 if image_refs or attachments_text:
-                    enhanced_user_input = f"{image_refs}{user_input}"
+                    enhanced_user_input = f"{user_input} {image_refs}"
                     if attachments_text:
                         enhanced_user_input += f"\n\nATTACHMENTS:\n{attachments_text}"

From 9d7894a13f0651d54c9fd016fa7d87fc02b40195 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 10:10:26 -0700
Subject: [PATCH 05/10] Organize

---
 modules/llama_cpp_server.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 36411105..ee63262e 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -121,6 +121,18 @@ class LlamaServer:
             to_ban = [[int(token_id), False] for token_id in state['custom_token_bans'].split(',')]
             payload["logit_bias"] = to_ban
 
+        # Add image data if present
+        if 'image_attachments' in state:
+            image_data = []
+            for attachment in state['image_attachments']:
+                image_data.append({
+                    "data": attachment['image_data'],
+                    "id": attachment['image_id']
+                })
+
+            if image_data:
+                payload["image_data"] = image_data
+
         return payload
 
     def generate_with_streaming(self, prompt, state):
@@ -140,20 +152,9 @@ class LlamaServer:
             "cache_prompt": True
         })
 
-        # Add image data if present
-        if 'image_attachments' in state:
-            image_data = []
-            for attachment in state['image_attachments']:
-                image_data.append({
-                    "data": attachment['image_data'],
-                    "id": attachment['image_id']
-                })
-            if image_data:
-                payload["image_data"] = image_data
-
         if shared.args.verbose:
             logger.info("GENERATE_PARAMS=")
-            printable_payload = {k: v for k, v in payload.items() if k != "prompt"}
+            printable_payload = {k: v for k, v in payload.items() if k not in ["prompt", "image_data"]}
             pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
             print()

From d702a2a9623d77ae260a2f37f9f83c763837d44b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 11:51:05 -0700
Subject: [PATCH 06/10] Lint

---
 extensions/openai/completions.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 98cfba05..5cc261b3 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -128,7 +128,6 @@ def process_multimodal_content(content):
 
     if isinstance(content, list):
         text_content = ""
-        image_refs = ""
         attachments = []
 
         for item in content:

From 1f3b1a1b9459bcafcdd683b74d4a050001a04785 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 12:14:51 -0700
Subject: [PATCH 07/10] Simplify things

---
 extensions/openai/completions.py | 91 +++++++++-----------------------
 1 file changed, 24 insertions(+), 67 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 5cc261b3..2374733a 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -3,7 +3,6 @@ import copy
 import json
 import time
 from collections import deque
-from datetime import datetime
 
 import requests
 import tiktoken
 from pydantic import ValidationError
@@ -86,49 +85,34 @@ def process_parameters(body, is_legacy=False):
     return generate_params
 
 
-def get_current_timestamp():
-    """Returns the current time in 24-hour format"""
-    return datetime.now().strftime('%b %d, %Y %H:%M')
-
-
 def process_image_url(url, image_id):
-    """Process an image URL and return attachment data"""
+    """Process an image URL and return attachment data for llama.cpp"""
     try:
         if url.startswith("data:"):
-            # Handle data URL (data:image/jpeg;base64,...)
             if "base64," in url:
                 image_data = url.split("base64,", 1)[1]
             else:
                 raise ValueError("Unsupported data URL format")
         else:
-            # Handle regular URL - download image
-            headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
-            }
+            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
             response = requests.get(url, timeout=10, headers=headers)
             response.raise_for_status()
             image_data = base64.b64encode(response.content).decode('utf-8')
 
-        return {
-            "name": f"image_{image_id}",
-            "type": "image",
-            "image_data": image_data,
-            "image_id": image_id,
-            "file_path": f"api_image_{image_id}",
-        }
+        return {"image_data": image_data, "image_id": image_id}
     except Exception as e:
         logger.error(f"Error processing image URL {url}: {e}")
         return None
 
 
 def process_multimodal_content(content):
-    """Process multimodal content and return text content and attachments"""
+    """Extract text and images from OpenAI multimodal format"""
     if isinstance(content, str):
         return content, []
 
     if isinstance(content, list):
         text_content = ""
-        attachments = []
+        images = []
 
         for item in content:
             if item.get("type") == "text":
                 text_content += item.get("text", "")
             elif item.get("type") == "image_url":
                 image_url = item.get("image_url", {}).get("url", "")
                 if image_url:
-                    attachment = process_image_url(image_url, len(attachments) + 1)
-                    if attachment:
-                        attachments.append(attachment)
-                    else:
-                        # Log warning but continue processing
-                        logger.warning(f"Failed to process image URL: {image_url}")
+                    image = process_image_url(image_url, len(images) + 1)
+                    if image:
+                        images.append(image)
 
-        return text_content, attachments
+        return text_content, images
 
     return str(content), []
 
@@ -159,45 +143,35 @@ def convert_history(history):
     user_input = ""
     user_input_last = True
     system_message = ""
-    metadata = {}
-
-    # Keep track of attachments for the current message being built
-    pending_attachments = []
+    all_images = []  # Simple list to collect all images
 
     for entry in history:
         content = entry["content"]
         role = entry["role"]
 
         if role == "user":
             # Process multimodal content
-            processed_content, attachments = process_multimodal_content(content)
+            processed_content, images = process_multimodal_content(content)
+            if images:
+                image_refs = "".join(f"[img-{img['image_id']}]" for img in images)
+                processed_content = f"{processed_content} {image_refs}"
+
             user_input = processed_content
             user_input_last = True
+            all_images.extend(images)  # Add any images to our collection
 
             if current_message:
                 chat_dialogue.append([current_message, '', ''])
                 current_message = ""
 
             current_message = processed_content
-            pending_attachments = attachments  # Store attachments for when message is added
 
         elif role == "assistant":
             if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "":
                 continue  # skip tool calls
             current_reply = content
             user_input_last = False
             if current_message:
-                row_idx = len(chat_dialogue)  # Calculate index here, right before adding
                 chat_dialogue.append([current_message, current_reply, ''])
-
-                # Add attachments to metadata if any
-                if pending_attachments:
-                    user_key = f"user_{row_idx}"
-                    metadata[user_key] = {
-                        "timestamp": get_current_timestamp(),
-                        "attachments": pending_attachments
-                    }
-                    pending_attachments = []  # Clear pending attachments
-
                 current_message = ""
                 current_reply = ""
         else:
@@ -209,18 +183,13 @@ def convert_history(history):
         elif role == "system":
             system_message += f"\n{content}" if system_message else content
 
-    # Handle case where there's a pending user message at the end
-    if current_message and pending_attachments:
-        row_idx = len(chat_dialogue)  # This will be the index when the message is processed
-        user_key = f"user_{row_idx}"
-        metadata[user_key] = {
-            "timestamp": get_current_timestamp(),
-            "attachments": pending_attachments
-        }
-
     if not user_input_last:
         user_input = ""
 
-    return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue), 'metadata': metadata}
+    return user_input, system_message, {
+        'internal': chat_dialogue,
+        'visible': copy.deepcopy(chat_dialogue),
+        'images': all_images  # Simple list of all images from the conversation
+    }
 
 
 def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, prompt_only=False) -> dict:
@@ -298,15 +267,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     # History
     user_input, custom_system_message, history = convert_history(messages)
 
-    # Collect image attachments for multimodal support
-    image_attachments = []
-    if 'metadata' in history:
-        for key, value in history['metadata'].items():
-            if 'attachments' in value:
-                for attachment in value['attachments']:
-                    if attachment.get('type') == 'image':
-                        image_attachments.append(attachment)
-
     generate_params.update({
         'mode': body['mode'],
         'name1': name1,
@@ -323,10 +283,9 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         'stream': stream
     })
 
-    # Add image attachments to state for llama.cpp multimodal support
-    if image_attachments:
-        generate_params['image_attachments'] = image_attachments
+    # Add images to state for llama.cpp multimodal support
+    if history.get('images'):
+        generate_params['image_attachments'] = history['images']
 
     max_tokens = generate_params['max_new_tokens']
     if max_tokens in [None, 0]:
         generate_params['max_new_tokens'] = 512

From 27affa9db755e53c094e9d5f30fa85c9aa3e68bb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 5 Jun 2025 10:47:07 -0700
Subject: [PATCH 08/10] Pre-merge dev branch

---
 modules/chat.py | 245 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 182 insertions(+), 63 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 55e7866c..275f28f9 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -467,19 +467,21 @@ def get_stopping_strings(state):
     return result
 
 
-def add_message_version(history, row_idx, is_current=True):
-    key = f"assistant_{row_idx}"
+def add_message_version(history, role, row_idx, is_current=True):
+    key = f"{role}_{row_idx}"
 
+    if 'metadata' not in history:
+        history['metadata'] = {}
     if key not in history['metadata']:
         history['metadata'][key] = {}
     if "versions" not in history['metadata'][key]:
         history['metadata'][key]["versions"] = []
 
-    current_content = history['internal'][row_idx][1]
-    current_visible = history['visible'][row_idx][1]
+    # Determine which index to use for content based on role
+    content_idx = 0 if role == 'user' else 1
+    current_content = history['internal'][row_idx][content_idx]
+    current_visible = history['visible'][row_idx][content_idx]
 
-    # Always add the current message as a new version entry.
-    # The timestamp will differentiate it even if content is identical to a previous version.
     history['metadata'][key]["versions"].append({
         "content": current_content,
         "visible_content": current_visible,
         "timestamp": get_current_timestamp()
     })
@@ -534,6 +536,13 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
                 "type": "application/pdf",
                 "content": content,
             }
+        elif file_extension == '.docx':
+            content = extract_docx_text(path)
+            attachment = {
+                "name": filename,
+                "type": "application/docx",
+                "content": content,
+            }
         else:
             # Default handling for text files
             with open(path, 'r', encoding='utf-8') as f:
                 content = f.read()
@@ -569,6 +578,79 @@ def extract_pdf_text(pdf_path):
         return f"[Error extracting PDF text: {str(e)}]"
 
 
+def extract_docx_text(docx_path):
+    """
+    Extract text from a .docx file, including headers,
+    body (paragraphs and tables), and footers.
+    """
+    try:
+        import docx
+
+        doc = docx.Document(docx_path)
+        parts = []
+
+        # 1) Extract non-empty header paragraphs from each section
+        for section in doc.sections:
+            for para in section.header.paragraphs:
+                text = para.text.strip()
+                if text:
+                    parts.append(text)
+
+        # 2) Extract body blocks (paragraphs and tables) in document order
+        parent_elm = doc.element.body
+        for child in parent_elm.iterchildren():
+            if isinstance(child, docx.oxml.text.paragraph.CT_P):
+                para = docx.text.paragraph.Paragraph(child, doc)
+                text = para.text.strip()
+                if text:
+                    parts.append(text)
+
+            elif isinstance(child, docx.oxml.table.CT_Tbl):
+                table = docx.table.Table(child, doc)
+                for row in table.rows:
+                    cells = [cell.text.strip() for cell in row.cells]
+                    parts.append("\t".join(cells))
+
+        # 3) Extract non-empty footer paragraphs from each section
+        for section in doc.sections:
+            for para in section.footer.paragraphs:
+                text = para.text.strip()
+                if text:
+                    parts.append(text)
+
+        return "\n".join(parts)
+
+    except Exception as e:
+        logger.error(f"Error extracting text from DOCX: {e}")
+        return f"[Error extracting DOCX text: {str(e)}]"
+
+
+def generate_search_query(user_message, state):
+    """Generate a search query from user message using the LLM"""
+    # Augment the user message with search instruction
+    augmented_message = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else."
+
+    # Use a minimal state for search query generation but keep the full history
+    search_state = state.copy()
+    search_state['max_new_tokens'] = 64
+    search_state['auto_max_new_tokens'] = False
+    search_state['enable_thinking'] = False
+
+    # Generate the full prompt using existing history + augmented message
+    formatted_prompt = generate_chat_prompt(augmented_message, search_state)
+
+    query = ""
+    for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True):
+        query = reply
+
+    # Strip and remove surrounding quotes if present
+    query = query.strip()
+    if len(query) >= 2 and query.startswith('"') and query.endswith('"'):
+        query = query[1:-1]
+
+    return query
+
+
 def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False):
     # Handle dict format with text and files
     files = []
@@ -614,7 +696,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             state['image_attachments'] = image_attachments
 
         # Add web search results as attachments if enabled
-        add_web_search_attachments(output, row_idx, text, state)
+        if state.get('enable_web_search', False):
+            search_query = generate_search_query(text, state)
+            add_web_search_attachments(output, row_idx, text, search_query, state)
 
         # Apply extensions
         text, visible_text = apply_extensions('chat_input', text, visible_text, state)
@@ -638,9 +722,18 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
     if regenerate:
         row_idx = len(output['internal']) - 1
 
-        # Store the first response as a version before regenerating
+        # Store the old response as a version before regenerating
         if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'):
-            add_message_version(output, row_idx, is_current=False)
+            add_message_version(output, "assistant", row_idx, is_current=False)
+
+        # Add new empty version (will be filled during streaming)
+        key = f"assistant_{row_idx}"
+        output['metadata'][key]["versions"].append({
+            "content": "",
+            "visible_content": "",
+            "timestamp": get_current_timestamp()
+        })
+        output['metadata'][key]["current_version_index"] = len(output['metadata'][key]["versions"]) - 1
 
         if loading_message:
             yield {
@@ -672,7 +765,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
 
     # Add timestamp for assistant's response at the start of generation
     row_idx = len(output['internal']) - 1
-    update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp())
+    update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp(), model_name=shared.model_name)
 
     # Generate
     reply = None
@@ -694,33 +787,51 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
         if _continue:
             output['internal'][-1] = [text, last_reply[0] + reply]
             output['visible'][-1] = [visible_text, last_reply[1] + visible_reply]
-            if is_stream:
-                yield output
         elif not (j == 0 and visible_reply.strip() == ''):
             output['internal'][-1] = [text, reply.lstrip(' ')]
             output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')]
-            if is_stream:
-                yield output
 
-    # Add the newly generated response as a version (only for regeneration)
-    if regenerate:
-        row_idx = len(output['internal']) - 1
-        add_message_version(output, row_idx, is_current=True)
+        # Keep version metadata in sync during streaming (for regeneration)
+        if regenerate:
+            row_idx = len(output['internal']) - 1
+            key = f"assistant_{row_idx}"
+            current_idx = output['metadata'][key]['current_version_index']
+            output['metadata'][key]['versions'][current_idx].update({
+                'content': output['internal'][row_idx][1],
+                'visible_content': output['visible'][row_idx][1]
+            })
+
+        if is_stream:
+            yield output
 
     output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
+
+    # Final sync for version metadata (in case streaming was disabled)
+    if regenerate:
+        row_idx = len(output['internal']) - 1
+        key = f"assistant_{row_idx}"
+        current_idx = output['metadata'][key]['current_version_index']
+        output['metadata'][key]['versions'][current_idx].update({
+            'content': output['internal'][row_idx][1],
+            'visible_content': output['visible'][row_idx][1]
+        })
+
     yield output
 
 
-def impersonate_wrapper(text, state):
+def impersonate_wrapper(textbox, state):
+    text = textbox['text']
     static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
     prompt = generate_chat_prompt('', state, impersonate=True)
     stopping_strings = get_stopping_strings(state)
 
-    yield text + '...', static_output
+    textbox['text'] = text + '...'
+    yield textbox, static_output
     reply = None
     for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True):
-        yield (text + reply).lstrip(' '), static_output
+        textbox['text'] = (text + reply).lstrip(' ')
+        yield textbox, static_output
         if shared.stop_everything:
             return
 
@@ -769,7 +880,9 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
     last_save_time = time.monotonic()
     save_interval = 8
     for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)):
-        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
+        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'], last_message_only=(i > 0)), history
+        if i == 0:
+            time.sleep(0.125)  # We need this to make sure the first update goes through
 
         current_time = time.monotonic()
         # Save on first iteration or if save_interval seconds have passed
@@ -800,9 +913,12 @@ def remove_last_message(history):
     return html.unescape(last[0]), history
 
 
-def send_dummy_message(textbox, state):
+def send_dummy_message(text, state):
     history = state['history']
-    text = textbox['text']
+
+    # Handle both dict and string inputs
+    if isinstance(text, dict):
+        text = text['text']
 
     # Initialize metadata if not present
     if 'metadata' not in history:
@@ -816,9 +932,12 @@ def send_dummy_message(text, state):
     return history
 
 
-def send_dummy_reply(textbox, state):
+def send_dummy_reply(text, state):
     history = state['history']
-    text = textbox['text']
+
+    # Handle both dict and string inputs
+    if isinstance(text, dict):
+        text = text['text']
 
     # Initialize metadata if not present
     if 'metadata' not in history:
@@ -1487,76 +1606,76 @@ def handle_edit_message_click(state):
     if message_index >= len(history['internal']):
         html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
-        return [history, html_output, gr.update()]
+        return [history, html_output]
 
-    # Use the role passed from frontend
-    is_user_msg = (role == "user")
-    role_idx = 0 if is_user_msg else 1
+    role_idx = 0 if role == "user" else 1
 
-    # For assistant messages, save the original version BEFORE updating content
-    if not is_user_msg:
-        if not history['metadata'].get(f"assistant_{message_index}", {}).get('versions'):
-            add_message_version(history, message_index, is_current=False)
+    if 'metadata' not in history:
+        history['metadata'] = {}
+
+    key = f"{role}_{message_index}"
+    if key not in history['metadata']:
+        history['metadata'][key] = {}
+
+    # If no versions exist yet for this message, store the current (pre-edit) content as the first version.
+    if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]:
+        original_content = history['internal'][message_index][role_idx]
+        original_visible = history['visible'][message_index][role_idx]
+        original_timestamp = history['metadata'][key].get('timestamp', get_current_timestamp())
+
+        history['metadata'][key]["versions"] = [{
+            "content": original_content,
+            "visible_content": original_visible,
+            "timestamp": original_timestamp
+        }]
 
-    # NOW update the message content
     history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True)
     history['visible'][message_index][role_idx] = html.escape(new_text)
 
-    # Branch if editing user message, add version if editing assistant message
-    if is_user_msg:
-        # Branch like branch-here
-        history['visible'] = history['visible'][:message_index + 1]
-        history['internal'] = history['internal'][:message_index + 1]
-        new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
-        save_history(history, new_unique_id, state['character_menu'], state['mode'])
-        histories = find_all_histories_with_first_prompts(state)
-        past_chats_update = gr.update(choices=histories, value=new_unique_id)
-        state['unique_id'] = new_unique_id
-    elif not is_user_msg:
-        # Add the new version as current
-        add_message_version(history, message_index, is_current=True)
-        past_chats_update = gr.update()
-    else:
-        past_chats_update = gr.update()
+    add_message_version(history, role, message_index, is_current=True)
 
     save_history(history, state['unique_id'], state['character_menu'], state['mode'])
 
     html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
-    return [history, html_output, past_chats_update]
+    return [history, html_output]
 
 
 def handle_navigate_version_click(state):
     history = state['history']
     message_index = int(state['navigate_message_index'])
     direction = state['navigate_direction']
+    role = state['navigate_message_role']
 
-    # Get assistant message metadata
-    key = f"assistant_{message_index}"
-    if key not in history['metadata'] or 'versions' not in history['metadata'][key]:
-        # No versions to navigate
+    if not role:
+        logger.error("Role not provided for version navigation.")
+        html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
+        return [history, html]
+
+    key = f"{role}_{message_index}"
+    if 'metadata' not in history or key not in history['metadata'] or 'versions' not in history['metadata'][key]:
         html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
         return [history, html]
 
     metadata = history['metadata'][key]
-    current_idx = metadata.get('current_version_index', 0)
     versions = metadata['versions']
+    # Default to the last version if current_version_index is not set
+    current_idx = metadata.get('current_version_index', len(versions) - 1 if versions else 0)
 
-    # Calculate new index
     if direction == 'left':
         new_idx = max(0, current_idx - 1)
     else:  # right
         new_idx = min(len(versions) - 1, current_idx + 1)
 
     if new_idx == current_idx:
-        # No change needed
        html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
        return [history, html]
 
-    # Update history with new version
-    version = versions[new_idx]
-    history['internal'][message_index][1] = version['content']
-    history['visible'][message_index][1] = version['visible_content']
+    msg_content_idx = 0 if role == 'user' else 1  # 0 for user content, 1 for assistant content in the pair
+    version_to_load = versions[new_idx]
+    history['internal'][message_index][msg_content_idx] = version_to_load['content']
+    history['visible'][message_index][msg_content_idx] = version_to_load['visible_content']
     metadata['current_version_index'] = new_idx
+    update_message_metadata(history['metadata'], role, message_index, timestamp=version_to_load['timestamp'])
 
     # Redraw and save
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

From 7366ff5dfa760f84cd0ff69c460888791466fd95 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 5 Jun 2025 10:49:02 -0700
Subject: [PATCH 09/10] Change a class name

---
 css/main.css              | 2 +-
 modules/html_generator.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/css/main.css b/css/main.css
index 268ddb74..10089b1d 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1555,7 +1555,7 @@ strong {
     flex-direction: column;
 }
 
-.attachment-image {
+.image-preview {
     border-radius: 16px;
     margin-bottom: 5px;
     object-fit: cover;
diff --git a/modules/html_generator.py b/modules/html_generator.py
index aa037314..44e0b236 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -377,7 +377,7 @@ def format_message_attachments(history, role, index):
                 file_path = attachment.get("file_path", "")
                 attachments_html += (
                     f'<div class="attachment-box image-attachment">'
-                    f'<img src="file/{file_path}" class="attachment-image" alt="{name}">'
+                    f'<img src="file/{file_path}" class="image-preview" alt="{name}">'
                     f'<div class="attachment-name">{name}</div>'
                     f'</div>'
                 )

From 0783f5c891badc96146d654d1f2bcbdd99f433af Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 5 Jun 2025 11:42:12 -0700
Subject: [PATCH 10/10] Use the latest format

---
 extensions/openai/completions.py |  2 +-
 modules/chat.py                  | 21 +++++++++++++++++----
 modules/llama_cpp_server.py      | 12 ++++++------
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 2374733a..4e4e310f 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -150,7 +150,7 @@ def convert_history(history):
             # Process multimodal content
             processed_content, images = process_multimodal_content(content)
             if images:
-                image_refs = "".join(f"[img-{img['image_id']}]" for img in images)
+                image_refs = "".join("<__media__>" for img in images)
                 processed_content = f"{processed_content} {image_refs}"
 
diff --git a/modules/chat.py b/modules/chat.py
index 275f28f9..9dc8d1fd 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -225,7 +225,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
                 for attachment in metadata[user_key]["attachments"]:
                     if attachment.get("type") == "image":
                         # Add image reference for multimodal models
-                        image_refs += f"[img-{attachment['image_id']}]"
+                        image_refs += "<__media__>"
                     else:
                         # Handle text/PDF attachments as before
                         filename = attachment.get("name", "file")
@@ -260,7 +260,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
 
                 for attachment in metadata[user_key]["attachments"]:
                     if attachment.get("type") == "image":
-                        image_refs += f"[img-{attachment['image_id']}]"
+                        image_refs += "<__media__>"
                     else:
                         filename = attachment.get("name", "file")
                         content = attachment.get("content", "")
@@ -517,17 +517,30 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
             with open(path, 'rb') as f:
                 image_data = base64.b64encode(f.read()).decode('utf-8')
 
+            # Determine MIME type from extension
+            mime_type_map = {
+                '.jpg': 'image/jpeg',
+                '.jpeg': 'image/jpeg',
+                '.png': 'image/png',
+                '.webp': 'image/webp',
+                '.bmp': 'image/bmp',
+                '.gif': 'image/gif'
+            }
+            mime_type = mime_type_map.get(file_extension, 'image/jpeg')
+
+            # Format as data URL
+            data_url = f"data:{mime_type};base64,{image_data}"
+
             # Generate unique image ID
             image_id = len([att for att in history['metadata'][key]["attachments"] if att.get("type") == "image"]) + 1
 
             attachment = {
                 "name": filename,
                 "type": "image",
-                "image_data": image_data,
+                "image_data": data_url,
                 "image_id": image_id,
                 "file_path": str(path)  # For UI preview
             }
-
         elif file_extension == '.pdf':
             # Process PDF file
             content = extract_pdf_text(path)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 8b4ed7a7..ca1b2c47 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -123,15 +123,15 @@ class LlamaServer:
 
         # Add image data if present
         if 'image_attachments' in state:
-            image_data = []
+            medias = []
             for attachment in state['image_attachments']:
-                image_data.append({
-                    "data": attachment['image_data'],
-                    "id": attachment['image_id']
+                medias.append({
+                    "type": "image",
+                    "data": attachment['image_data']
                 })
 
-            if image_data:
-                payload["image_data"] = image_data
+            if medias:
+                payload["medias"] = medias
 
         return payload
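
The series above can be exercised end to end through the OpenAI-compatible endpoint once a vision-capable GGUF model (with its mmproj file) is loaded. The following client sketch is not part of the patch series; it assumes the server's default API address (http://127.0.0.1:5000) and a local test image named cat.png, both of which are placeholders. It sends a base64 data URL in the multimodal message format that process_multimodal_content() and the validation code in patch 02 accept:

# Hypothetical client for the multimodal API added in these patches.
# Assumptions: API server at 127.0.0.1:5000, a vision-capable model
# loaded, and a local file "cat.png" to send (both are placeholders).
import base64

import requests

with open("cat.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ],
    }],
    "max_tokens": 512,
}

response = requests.post("http://127.0.0.1:5000/v1/chat/completions", json=payload, timeout=120)
print(response.json()["choices"][0]["message"]["content"])

Plain http(s) image URLs should also work: process_image_url() downloads them server-side, using the browser-style User-Agent header added in patch 03, and converts them to base64 before they reach llama.cpp.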