Add multimodal support (llama.cpp)

2025-06-08 14:46:14 -04:00 · 2025-05-28 05:52:07 -07:00 · 2025-05-28 05:52:07 -07:00 · f92e1f44a0
commit f92e1f44a0
parent 6c3590ba9a
4 changed files with 117 additions and 36 deletions
--- a/css/main.css
+++ b/css/main.css
@@ -1550,3 +1550,16 @@ strong {
    color: var(--body-text-color-subdued);
    margin-top: 4px;
 }
 .image-attachment {
    flex-direction: column;
 }
 .attachment-image {
    border-radius: 16px;
    margin-bottom: 5px;
    object-fit: cover;
    object-position: center;
    border: 2px solid var(--border-color-primary);
    aspect-ratio: 1 / 1;
 }
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -220,13 +220,22 @@ def generate_chat_prompt(user_input, state, **kwargs):
            # Add attachment content if present
            if user_key in metadata and "attachments" in metadata[user_key]:
                attachments_text = ""
                image_refs = ""
                for attachment in metadata[user_key]["attachments"]:
                    if attachment.get("type") == "image":
                        # Add image reference for multimodal models
                        image_refs += f"[img-{attachment['image_id']}]"
                    else:
                        # Handle text/PDF attachments as before
                        filename = attachment.get("name", "file")
                        content = attachment.get("content", "")
                        attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
                if image_refs or attachments_text:
                    enhanced_user_msg = f"{image_refs}{user_msg}"
                    if attachments_text:
-                    enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}"
+                        enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
            messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg})
@@ -240,22 +249,29 @@ def generate_chat_prompt(user_input, state, **kwargs):
        has_attachments = user_key in metadata and "attachments" in metadata[user_key]
    if (user_input or has_attachments) and not impersonate and not _continue:
        # For the current user input being processed, check if we need to add attachments
        if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
        current_row_idx = len(history)
        user_key = f"user_{current_row_idx}"
        enhanced_user_input = user_input
        if user_key in metadata and "attachments" in metadata[user_key]:
            attachments_text = ""
            image_refs = ""
            for attachment in metadata[user_key]["attachments"]:
                if attachment.get("type") == "image":
                    image_refs += f"[img-{attachment['image_id']}]"
                else:
                    filename = attachment.get("name", "file")
                    content = attachment.get("content", "")
                    attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
            if image_refs or attachments_text:
                enhanced_user_input = f"{image_refs}{user_input}"
                if attachments_text:
-                    user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}"
+                    enhanced_user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
-        messages.append({"role": "user", "content": user_input})
+        messages.append({"role": "user", "content": enhanced_user_input})
    def make_prompt(messages):
        if state['mode'] == 'chat-instruct' and _continue:
@@ -493,26 +509,43 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
    file_extension = path.suffix.lower()
    try:
-        # Handle different file types
+        # Handle image files
-        if file_extension == '.pdf':
+        if file_extension in ['.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif']:
            # Convert image to base64
            with open(path, 'rb') as f:
                image_data = base64.b64encode(f.read()).decode('utf-8')
            # Generate unique image ID
            image_id = len([att for att in history['metadata'][key]["attachments"] if att.get("type") == "image"]) + 1
            attachment = {
                "name": filename,
                "type": "image",
                "image_data": image_data,
                "image_id": image_id,
                "file_path": str(path)  # For UI preview
            }
        elif file_extension == '.pdf':
            # Process PDF file
            content = extract_pdf_text(path)
-            file_type = "application/pdf"
+            attachment = {
                "name": filename,
                "type": "application/pdf",
                "content": content,
            }
        else:
            # Default handling for text files
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
            file_type = "text/plain"
        # Add attachment
            attachment = {
                "name": filename,
-            "type": file_type,
+                "type": "text/plain",
                "content": content,
            }
        history['metadata'][key]["attachments"].append(attachment)
-        return content  # Return the content for reuse
+        return attachment  # Return the attachment for reuse
    except Exception as e:
        logger.error(f"Error processing attachment {filename}: {e}")
        return None
@@ -567,6 +600,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
        for file_path in files:
            add_message_attachment(output, row_idx, file_path, is_user=True)
        # Collect image attachments for llama.cpp
        image_attachments = []
        if 'metadata' in output:
            user_key = f"user_{row_idx}"
            if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
                for attachment in output['metadata'][user_key]["attachments"]:
                    if attachment.get("type") == "image":
                        image_attachments.append(attachment)
        # Add image attachments to state for the generation
        if image_attachments:
            state['image_attachments'] = image_attachments
        # Add web search results as attachments if enabled
        add_web_search_attachments(output, row_idx, text, state)
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -372,7 +372,17 @@ def format_message_attachments(history, role, index):
        for attachment in attachments:
            name = html.escape(attachment["name"])
-            # Make clickable if URL exists
+            if attachment.get("type") == "image":
                # Show image preview
                file_path = attachment.get("file_path", "")
                attachments_html += (
                    f'<div class="attachment-box image-attachment">'
                    f'<img src="file/{file_path}" alt="{name}" class="attachment-image" />'
                    f'<div class="attachment-name">{name}</div>'
                    f'</div>'
                )
            else:
                # Make clickable if URL exists (web search)
                if "url" in attachment:
                    name = f'<a href="{html.escape(attachment["url"])}" target="_blank" rel="noopener noreferrer">{name}</a>'
@@ -382,6 +392,7 @@ def format_message_attachments(history, role, index):
                    f'<div class="attachment-name">{name}</div>'
                    f'</div>'
                )
        attachments_html += '</div>'
        return attachments_html
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -140,6 +140,17 @@ class LlamaServer:
            "cache_prompt": True
        })
        # Add image data if present
        if 'image_attachments' in state:
            image_data = []
            for attachment in state['image_attachments']:
                image_data.append({
                    "data": attachment['image_data'],
                    "id": attachment['image_id']
                })
            if image_data:
                payload["image_data"] = image_data
        if shared.args.verbose:
            logger.info("GENERATE_PARAMS=")
            printable_payload = {k: v for k, v in payload.items() if k != "prompt"}