From f92e1f44a0e076f27ecafe942d42ae84ad681c09 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 05:52:07 -0700
Subject: [PATCH 01/10] Add multimodal support (llama.cpp)

---
 css/main.css                |  13 +++++
 modules/chat.py             | 100 ++++++++++++++++++++++++++----------
 modules/html_generator.py   |  29 +++++++----
 modules/llama_cpp_server.py |  11 ++++
 4 files changed, 117 insertions(+), 36 deletions(-)

diff --git a/css/main.css b/css/main.css
index 181a19b8..268ddb74 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1550,3 +1550,16 @@ strong {
     color: var(--body-text-color-subdued);
     margin-top: 4px;
 }
+
+.image-attachment {
+    flex-direction: column;
+}
+
+.attachment-image {
+    border-radius: 16px;
+    margin-bottom: 5px;
+    object-fit: cover;
+    object-position: center;
+    border: 2px solid var(--border-color-primary);
+    aspect-ratio: 1 / 1;
+}
diff --git a/modules/chat.py b/modules/chat.py
index b2aacd5c..1a7556c8 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -220,13 +220,22 @@ def generate_chat_prompt(user_input, state, **kwargs):
             # Add attachment content if present
             if user_key in metadata and "attachments" in metadata[user_key]:
                 attachments_text = ""
-                for attachment in metadata[user_key]["attachments"]:
-                    filename = attachment.get("name", "file")
-                    content = attachment.get("content", "")
-                    attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+                image_refs = ""
 
-                if attachments_text:
-                    enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}"
+                for attachment in metadata[user_key]["attachments"]:
+                    if attachment.get("type") == "image":
+                        # Add image reference for multimodal models
+                        image_refs += f"[img-{attachment['image_id']}]"
+                    else:
+                        # Handle text/PDF attachments as before
+                        filename = attachment.get("name", "file")
+                        content = attachment.get("content", "")
+                        attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
+
+                if image_refs or attachments_text:
+                    enhanced_user_msg = f"{image_refs}{user_msg}"
+                    if attachments_text:
+                        enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
 
                 messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg})
 
@@ -240,22 +249,29 @@ def generate_chat_prompt(user_input, state, **kwargs):
         has_attachments = user_key in metadata and "attachments" in metadata[user_key]
 
         if (user_input or has_attachments) and not impersonate and not _continue:
-            # For the current user input being processed, check if we need to add attachments
-            if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
-                current_row_idx = len(history)
-                user_key = f"user_{current_row_idx}"
+            current_row_idx = len(history)
+            user_key = f"user_{current_row_idx}"
 
-                if user_key in metadata and "attachments" in metadata[user_key]:
-                    attachments_text = ""
-                    for attachment in metadata[user_key]["attachments"]:
+            enhanced_user_input = user_input
+
+            if user_key in metadata and "attachments" in metadata[user_key]:
+                attachments_text = ""
+                image_refs = ""
+
+                for attachment in metadata[user_key]["attachments"]:
+                    if attachment.get("type") == "image":
+                        image_refs += f"[img-{attachment['image_id']}]"
+                    else:
                         filename = attachment.get("name", "file")
                         content = attachment.get("content", "")
                         attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
 
+                if image_refs or attachments_text:
+                    enhanced_user_input = f"{image_refs}{user_input}"
                     if attachments_text:
-                        user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}"
+                        enhanced_user_input += f"\n\nATTACHMENTS:\n{attachments_text}"
 
-        messages.append({"role": "user", "content": user_input})
+        messages.append({"role": "user", "content": enhanced_user_input})
 
     def make_prompt(messages):
         if state['mode'] == 'chat-instruct' and _continue:
@@ -493,26 +509,43 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
     file_extension = path.suffix.lower()
 
     try:
-        # Handle different file types
-        if file_extension == '.pdf':
+        # Handle image files
+        if file_extension in ['.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif']:
+            # Convert image to base64
+            with open(path, 'rb') as f:
+                image_data = base64.b64encode(f.read()).decode('utf-8')
+
+            # Generate unique image ID
+            image_id = len([att for att in history['metadata'][key]["attachments"] if att.get("type") == "image"]) + 1
+
+            attachment = {
+                "name": filename,
+                "type": "image",
+                "image_data": image_data,
+                "image_id": image_id,
+                "file_path": str(path)  # For UI preview
+            }
+
+        elif file_extension == '.pdf':
             # Process PDF file
             content = extract_pdf_text(path)
-            file_type = "application/pdf"
+            attachment = {
+                "name": filename,
+                "type": "application/pdf",
+                "content": content,
+            }
         else:
             # Default handling for text files
             with open(path, 'r', encoding='utf-8') as f:
                 content = f.read()
-            file_type = "text/plain"
-
-        # Add attachment
-        attachment = {
-            "name": filename,
-            "type": file_type,
-            "content": content,
-        }
+            attachment = {
+                "name": filename,
+                "type": "text/plain",
+                "content": content,
+            }
 
         history['metadata'][key]["attachments"].append(attachment)
-        return content  # Return the content for reuse
+        return attachment  # Return the attachment for reuse
     except Exception as e:
         logger.error(f"Error processing attachment {filename}: {e}")
         return None
@@ -567,6 +600,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
         for file_path in files:
             add_message_attachment(output, row_idx, file_path, is_user=True)
 
+        # Collect image attachments for llama.cpp
+        image_attachments = []
+        if 'metadata' in output:
+            user_key = f"user_{row_idx}"
+            if user_key in output['metadata'] and "attachments" in output['metadata'][user_key]:
+                for attachment in output['metadata'][user_key]["attachments"]:
+                    if attachment.get("type") == "image":
+                        image_attachments.append(attachment)
+
+        # Add image attachments to state for the generation
+        if image_attachments:
+            state['image_attachments'] = image_attachments
+
         # Add web search results as attachments if enabled
         add_web_search_attachments(output, row_idx, text, state)
 
diff --git a/modules/html_generator.py b/modules/html_generator.py
index bfb278cd..aa037314 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -372,16 +372,27 @@ def format_message_attachments(history, role, index):
         for attachment in attachments:
             name = html.escape(attachment["name"])
 
-            # Make clickable if URL exists
-            if "url" in attachment:
-                name = f'<a href="{html.escape(attachment["url"])}" target="_blank" rel="noopener noreferrer">{name}</a>'
+            if attachment.get("type") == "image":
+                # Show image preview
+                file_path = attachment.get("file_path", "")
+                attachments_html += (
+                    f'<div class="attachment-box image-attachment">'
+                    f'<img src="file/{file_path}" class="attachment-image" alt="{name}">'
+                    f'<div class="attachment-name">{name}</div>'
+                    f'</div>'
+                )
+            else:
+                # Make clickable if URL exists (web search)
+                if "url" in attachment:
+                    name = f'<a href="{html.escape(attachment["url"])}" target="_blank" rel="noopener noreferrer">{name}</a>'
+
+                attachments_html += (
+                    f'<div class="attachment-box">'
+                    f'<div class="attachment-icon">{attachment_svg}</div>'
+                    f'<div class="attachment-name">{name}</div>'
+                    f'</div>'
+                )
 
-            attachments_html += (
-                f'<div class="attachment-box">'
-                f'<div class="attachment-icon">{attachment_svg}</div>'
-                f'<div class="attachment-name">{name}</div>'
-                f'</div>'
-            )
         attachments_html += '</div>'
         return attachments_html
 
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index d695c74e..36411105 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -140,6 +140,17 @@ class LlamaServer:
             "cache_prompt": True
         })
 
+        # Add image data if present
+        if 'image_attachments' in state:
+            image_data = []
+            for attachment in state['image_attachments']:
+                image_data.append({
+                    "data": attachment['image_data'],
+                    "id": attachment['image_id']
+                })
+            if image_data:
+                payload["image_data"] = image_data
+
         if shared.args.verbose:
             logger.info("GENERATE_PARAMS=")
             printable_payload = {k: v for k, v in payload.items() if k != "prompt"}

From 2e21b1f5e345702f5fa9075aa0697cefe077b72f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 09:14:26 -0700
Subject: [PATCH 02/10] Integrate with the API

---
 extensions/openai/completions.py | 129 ++++++++++++++++++++++++++++++-
 1 file changed, 125 insertions(+), 4 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 5181b18b..17f5dc9b 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -1,8 +1,11 @@
+import base64
 import copy
 import json
 import time
 from collections import deque
+from datetime import datetime
 
+import requests
 import tiktoken
 from pydantic import ValidationError
 
@@ -16,6 +19,7 @@ from modules.chat import (
     load_character_memoized,
     load_instruction_template_memoized
 )
+from modules.logging_colors import logger
 from modules.presets import load_preset_memoized
 from modules.text_generation import decode, encode, generate_reply
 
@@ -82,6 +86,67 @@ def process_parameters(body, is_legacy=False):
     return generate_params
 
 
+def get_current_timestamp():
+    """Returns the current time in 24-hour format"""
+    return datetime.now().strftime('%b %d, %Y %H:%M')
+
+
+def process_image_url(url, image_id):
+    """Process an image URL and return attachment data"""
+    try:
+        if url.startswith("data:"):
+            # Handle data URL (data:image/jpeg;base64,...)
+            if "base64," in url:
+                image_data = url.split("base64,", 1)[1]
+            else:
+                raise ValueError("Unsupported data URL format")
+        else:
+            # Handle regular URL - download image
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+            image_data = base64.b64encode(response.content).decode('utf-8')
+
+        return {
+            "name": f"image_{image_id}",
+            "type": "image",
+            "image_data": image_data,
+            "image_id": image_id,
+            "file_path": f"api_image_{image_id}",  # Add this for consistency with UI
+        }
+    except Exception as e:
+        logger.error(f"Error processing image URL {url}: {e}")
+        return None
+
+
+def process_multimodal_content(content):
+    """Process multimodal content and return text content and attachments"""
+    if isinstance(content, str):
+        return content, []
+
+    if isinstance(content, list):
+        text_content = ""
+        image_refs = ""
+        attachments = []
+
+        for item in content:
+            if item.get("type") == "text":
+                text_content += item.get("text", "")
+            elif item.get("type") == "image_url":
+                image_url = item.get("image_url", {}).get("url", "")
+                if image_url:
+                    attachment = process_image_url(image_url, len(attachments) + 1)
+                    if attachment:
+                        attachments.append(attachment)
+                        image_refs += f"[img-{attachment['image_id']}]"
+                    else:
+                        # Log warning but continue processing
+                        logger.warning(f"Failed to process image URL: {image_url}")
+
+        return f"{image_refs}{text_content}", attachments
+
+    return str(content), []
+
+
 def convert_history(history):
     '''
     Chat histories in this program are in the format [message, reply].
@@ -93,26 +158,46 @@ def convert_history(history):
     user_input = ""
     user_input_last = True
     system_message = ""
+    metadata = {}
+
+    # Keep track of attachments for the current message being built
+    pending_attachments = []
 
     for entry in history:
         content = entry["content"]
         role = entry["role"]
 
         if role == "user":
-            user_input = content
+            # Process multimodal content
+            processed_content, attachments = process_multimodal_content(content)
+            user_input = processed_content
             user_input_last = True
+
             if current_message:
                 chat_dialogue.append([current_message, '', ''])
                 current_message = ""
-            current_message = content
+
+            current_message = processed_content
+            pending_attachments = attachments  # Store attachments for when message is added
+
         elif role == "assistant":
             if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "":
                 continue  # skip tool calls
             current_reply = content
             user_input_last = False
             if current_message:
+                row_idx = len(chat_dialogue)  # Calculate index here, right before adding
                 chat_dialogue.append([current_message, current_reply, ''])
+
+                # Add attachments to metadata if any
+                if pending_attachments:
+                    user_key = f"user_{row_idx}"
+                    metadata[user_key] = {
+                        "timestamp": get_current_timestamp(),
+                        "attachments": pending_attachments
+                    }
+                    pending_attachments = []  # Clear pending attachments
+
                 current_message = ""
                 current_reply = ""
         else:
@@ -123,10 +208,19 @@ def convert_history(history):
         elif role == "system":
             system_message += f"\n{content}" if system_message else content
 
+    # Handle case where there's a pending user message at the end
+    if current_message and pending_attachments:
+        row_idx = len(chat_dialogue)  # This will be the index when the message is processed
+        user_key = f"user_{row_idx}"
+        metadata[user_key] = {
+            "timestamp": get_current_timestamp(),
+            "attachments": pending_attachments
+        }
+
     if not user_input_last:
         user_input = ""
 
-    return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue)}
+    return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue), 'metadata': metadata}
 
 
 def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, prompt_only=False) -> dict:
@@ -150,9 +244,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         elif m['role'] == 'function':
             raise InvalidRequestError(message="role: function is not supported.", param='messages')
 
-        if 'content' not in m and "image_url" not in m:
+        # Handle multimodal content validation
+        content = m.get('content')
+        if content is None:
             raise InvalidRequestError(message="messages: missing content", param='messages')
 
+        # Validate multimodal content structure
+        if isinstance(content, list):
+            for item in content:
+                if not isinstance(item, dict) or 'type' not in item:
+                    raise InvalidRequestError(message="messages: invalid content item format", param='messages')
+                if item['type'] not in ['text', 'image_url']:
+                    raise InvalidRequestError(message="messages: unsupported content type", param='messages')
+                if item['type'] == 'text' and 'text' not in item:
+                    raise InvalidRequestError(message="messages: missing text in content item", param='messages')
+                if item['type'] == 'image_url' and ('image_url' not in item or 'url' not in item['image_url']):
+                    raise InvalidRequestError(message="messages: missing image_url in content item", param='messages')
+
     # Chat Completions
     object_type = 'chat.completion' if not stream else 'chat.completion.chunk'
     created_time = int(time.time())
@@ -189,6 +297,15 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     # History
     user_input, custom_system_message, history = convert_history(messages)
 
+    # Collect image attachments for multimodal support
+    image_attachments = []
+    if 'metadata' in history:
+        for key, value in history['metadata'].items():
+            if 'attachments' in value:
+                for attachment in value['attachments']:
+                    if attachment.get('type') == 'image':
+                        image_attachments.append(attachment)
+
     generate_params.update({
         'mode': body['mode'],
         'name1': name1,
@@ -205,6 +322,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         'stream': stream
     })
 
+    # Add image attachments to state for llama.cpp multimodal support
+    if image_attachments:
+        generate_params['image_attachments'] = image_attachments
+
     max_tokens = generate_params['max_new_tokens']
     if max_tokens in [None, 0]:
         generate_params['max_new_tokens'] = 512

From c1a47a0b606428f77ea8d70d2572fe6d5310d72e Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 09:17:02 -0700
Subject: [PATCH 03/10] Better request header

---
 extensions/openai/completions.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 17f5dc9b..939bd9c3 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -102,7 +102,10 @@ def process_image_url(url, image_id):
             raise ValueError("Unsupported data URL format")
         else:
             # Handle regular URL - download image
-            response = requests.get(url, timeout=10)
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+            }
+            response = requests.get(url, timeout=10, headers=headers)
             response.raise_for_status()
             image_data = base64.b64encode(response.content).decode('utf-8')
 
@@ -111,7 +114,7 @@ def process_image_url(url, image_id):
             "type": "image",
             "image_data": image_data,
             "image_id": image_id,
-            "file_path": f"api_image_{image_id}",  # Add this for consistency with UI
+            "file_path": f"api_image_{image_id}",
         }
     except Exception as e:
         logger.error(f"Error processing image URL {url}: {e}")

From c6d0de8538b4ff76f4a0f1dd9dfd57dd19772a23 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 09:28:20 -0700
Subject: [PATCH 04/10] Better image positioning in prompts

---
 extensions/openai/completions.py | 3 +--
 modules/chat.py                  | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 939bd9c3..98cfba05 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -140,12 +140,11 @@ def process_multimodal_content(content):
                     attachment = process_image_url(image_url, len(attachments) + 1)
                     if attachment:
                         attachments.append(attachment)
-                        image_refs += f"[img-{attachment['image_id']}]"
                     else:
                         # Log warning but continue processing
                         logger.warning(f"Failed to process image URL: {image_url}")
 
-        return f"{image_refs}{text_content}", attachments
+        return text_content, attachments
 
     return str(content), []
 
diff --git a/modules/chat.py b/modules/chat.py
index 1a7556c8..55e7866c 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -233,7 +233,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
                     attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
 
                 if image_refs or attachments_text:
-                    enhanced_user_msg = f"{image_refs}{user_msg}"
+                    enhanced_user_msg = f"{user_msg} {image_refs}"
                     if attachments_text:
                         enhanced_user_msg += f"\n\nATTACHMENTS:\n{attachments_text}"
 
@@ -267,7 +267,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
                     attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
 
                 if image_refs or attachments_text:
-                    enhanced_user_input = f"{image_refs}{user_input}"
+                    enhanced_user_input = f"{user_input} {image_refs}"
                     if attachments_text:
                         enhanced_user_input += f"\n\nATTACHMENTS:\n{attachments_text}"

From 9d7894a13f0651d54c9fd016fa7d87fc02b40195 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 10:10:26 -0700
Subject: [PATCH 05/10] Organize

---
 modules/llama_cpp_server.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 36411105..ee63262e 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -121,6 +121,18 @@ class LlamaServer:
             to_ban = [[int(token_id), False] for token_id in state['custom_token_bans'].split(',')]
             payload["logit_bias"] = to_ban
 
+        # Add image data if present
+        if 'image_attachments' in state:
+            image_data = []
+            for attachment in state['image_attachments']:
+                image_data.append({
+                    "data": attachment['image_data'],
+                    "id": attachment['image_id']
+                })
+
+            if image_data:
+                payload["image_data"] = image_data
+
         return payload
 
     def generate_with_streaming(self, prompt, state):
@@ -140,20 +152,9 @@ class LlamaServer:
             "cache_prompt": True
         })
 
-        # Add image data if present
-        if 'image_attachments' in state:
-            image_data = []
-            for attachment in state['image_attachments']:
-                image_data.append({
-                    "data": attachment['image_data'],
-                    "id": attachment['image_id']
-                })
-            if image_data:
-                payload["image_data"] = image_data
-
         if shared.args.verbose:
             logger.info("GENERATE_PARAMS=")
-            printable_payload = {k: v for k, v in payload.items() if k != "prompt"}
+            printable_payload = {k: v for k, v in payload.items() if k not in ["prompt", "image_data"]}
             pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
             print()

From d702a2a9623d77ae260a2f37f9f83c763837d44b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 11:51:05 -0700
Subject: [PATCH 06/10] Lint

---
 extensions/openai/completions.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 98cfba05..5cc261b3 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -128,7 +128,6 @@ def process_multimodal_content(content):
 
     if isinstance(content, list):
         text_content = ""
-        image_refs = ""
         attachments = []
 
         for item in content:

From 1f3b1a1b9459bcafcdd683b74d4a050001a04785 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 28 May 2025 12:14:51 -0700
Subject: [PATCH 07/10] Simplify things

---
 extensions/openai/completions.py | 91 +++++++++-----------------------
 1 file changed, 24 insertions(+), 67 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 5cc261b3..2374733a 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -3,7 +3,6 @@ import copy
 import json
 import time
 from collections import deque
-from datetime import datetime
 
 import requests
 import tiktoken
 from pydantic import ValidationError
@@ -86,49 +85,34 @@ def process_parameters(body, is_legacy=False):
     return generate_params
 
 
-def get_current_timestamp():
-    """Returns the current time in 24-hour format"""
-    return datetime.now().strftime('%b %d, %Y %H:%M')
-
-
 def process_image_url(url, image_id):
-    """Process an image URL and return attachment data"""
+    """Process an image URL and return attachment data for llama.cpp"""
     try:
         if url.startswith("data:"):
-            # Handle data URL (data:image/jpeg;base64,...)
             if "base64," in url:
                 image_data = url.split("base64,", 1)[1]
             else:
                 raise ValueError("Unsupported data URL format")
         else:
-            # Handle regular URL - download image
-            headers = {
-                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
-            }
+            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
             response = requests.get(url, timeout=10, headers=headers)
             response.raise_for_status()
             image_data = base64.b64encode(response.content).decode('utf-8')
 
-        return {
-            "name": f"image_{image_id}",
-            "type": "image",
-            "image_data": image_data,
-            "image_id": image_id,
-            "file_path": f"api_image_{image_id}",
-        }
+        return {"image_data": image_data, "image_id": image_id}
     except Exception as e:
         logger.error(f"Error processing image URL {url}: {e}")
         return None
 
 
 def process_multimodal_content(content):
-    """Process multimodal content and return text content and attachments"""
+    """Extract text and images from OpenAI multimodal format"""
     if isinstance(content, str):
         return content, []
 
     if isinstance(content, list):
         text_content = ""
-        attachments = []
+        images = []
 
         for item in content:
             if item.get("type") == "text":
                 text_content += item.get("text", "")
             elif item.get("type") == "image_url":
                 image_url = item.get("image_url", {}).get("url", "")
                 if image_url:
-                    attachment = process_image_url(image_url, len(attachments) + 1)
-                    if attachment:
-                        attachments.append(attachment)
-                    else:
-                        # Log warning but continue processing
-                        logger.warning(f"Failed to process image URL: {image_url}")
+                    image = process_image_url(image_url, len(images) + 1)
+                    if image:
+                        images.append(image)
 
-        return text_content, attachments
+        return text_content, images
 
     return str(content), []
 
@@ -159,45 +143,35 @@ def convert_history(history):
     user_input = ""
     user_input_last = True
     system_message = ""
-    metadata = {}
-
-    # Keep track of attachments for the current message being built
-    pending_attachments = []
+    all_images = []  # Simple list to collect all images
 
     for entry in history:
         content = entry["content"]
         role = entry["role"]
 
         if role == "user":
             # Process multimodal content
-            processed_content, attachments = process_multimodal_content(content)
+            processed_content, images = process_multimodal_content(content)
+            if images:
+                image_refs = "".join(f"[img-{img['image_id']}]" for img in images)
+                processed_content = f"{processed_content} {image_refs}"
+
             user_input = processed_content
             user_input_last = True
+            all_images.extend(images)  # Add any images to our collection
 
             if current_message:
                 chat_dialogue.append([current_message, '', ''])
                 current_message = ""
 
             current_message = processed_content
-            pending_attachments = attachments  # Store attachments for when message is added
 
         elif role == "assistant":
             if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "":
                 continue  # skip tool calls
             current_reply = content
             user_input_last = False
             if current_message:
-                row_idx = len(chat_dialogue)  # Calculate index here, right before adding
                 chat_dialogue.append([current_message, current_reply, ''])
-
-                # Add attachments to metadata if any
-                if pending_attachments:
-                    user_key = f"user_{row_idx}"
-                    metadata[user_key] = {
-                        "timestamp": get_current_timestamp(),
-                        "attachments": pending_attachments
-                    }
-                    pending_attachments = []  # Clear pending attachments
-
                 current_message = ""
                 current_reply = ""
         else:
@@ -209,18 +183,13 @@ def convert_history(history):
         elif role == "system":
             system_message += f"\n{content}" if system_message else content
 
-    # Handle case where there's a pending user message at the end
-    if current_message and pending_attachments:
-        row_idx = len(chat_dialogue)  # This will be the index when the message is processed
-        user_key = f"user_{row_idx}"
-        metadata[user_key] = {
-            "timestamp": get_current_timestamp(),
-            "attachments": pending_attachments
-        }
-
     if not user_input_last:
         user_input = ""
 
-    return user_input, system_message, {'internal': chat_dialogue, 'visible': copy.deepcopy(chat_dialogue), 'metadata': metadata}
+    return user_input, system_message, {
+        'internal': chat_dialogue,
+        'visible': copy.deepcopy(chat_dialogue),
+        'images': all_images  # Simple list of all images from the conversation
+    }
 
 
 def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, prompt_only=False) -> dict:
@@ -298,15 +267,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     # History
     user_input, custom_system_message, history = convert_history(messages)
 
-    # Collect image attachments for multimodal support
-    image_attachments = []
-    if 'metadata' in history:
-        for key, value in history['metadata'].items():
-            if 'attachments' in value:
-                for attachment in value['attachments']:
-                    if attachment.get('type') == 'image':
-                        image_attachments.append(attachment)
-
     generate_params.update({
         'mode': body['mode'],
         'name1': name1,
@@ -323,10 +283,9 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         'stream': stream
     })
 
-    # Add image attachments to state for llama.cpp multimodal support
-    if image_attachments:
-        generate_params['image_attachments'] = image_attachments
+    # Add images to state for llama.cpp multimodal support
+    if history.get('images'):
+        generate_params['image_attachments'] = history['images']
 
     max_tokens = generate_params['max_new_tokens']
     if max_tokens in [None, 0]:
         generate_params['max_new_tokens'] = 512

From 27affa9db755e53c094e9d5f30fa85c9aa3e68bb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 5 Jun 2025 10:47:07 -0700
Subject: [PATCH 08/10] Pre-merge dev branch

---
 modules/chat.py | 245 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 182 insertions(+), 63 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 55e7866c..275f28f9 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -467,19 +467,21 @@ def get_stopping_strings(state):
     return result
 
 
-def add_message_version(history, row_idx, is_current=True):
-    key = f"assistant_{row_idx}"
+def add_message_version(history, role, row_idx, is_current=True):
+    key = f"{role}_{row_idx}"
 
+    if 'metadata' not in history:
+        history['metadata'] = {}
     if key not in history['metadata']:
         history['metadata'][key] = {}
     if "versions" not in history['metadata'][key]:
         history['metadata'][key]["versions"] = []
 
-    current_content = history['internal'][row_idx][1]
-    current_visible = history['visible'][row_idx][1]
+    # Determine which index to use for content based on role
+    content_idx = 0 if role == 'user' else 1
+    current_content = history['internal'][row_idx][content_idx]
+    current_visible = history['visible'][row_idx][content_idx]
 
-    # Always add the current message as a new version entry.
-    # The timestamp will differentiate it even if content is identical to a previous version.
     history['metadata'][key]["versions"].append({
         "content": current_content,
         "visible_content": current_visible,
         "timestamp": get_current_timestamp()
     })
@@ -534,6 +536,13 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
                 "type": "application/pdf",
                 "content": content,
             }
+        elif file_extension == '.docx':
+            content = extract_docx_text(path)
+            attachment = {
+                "name": filename,
+                "type": "application/docx",
+                "content": content,
+            }
         else:
             # Default handling for text files
             with open(path, 'r', encoding='utf-8') as f:
                 content = f.read()
@@ -569,6 +578,79 @@ def extract_pdf_text(pdf_path):
         return f"[Error extracting PDF text: {str(e)}]"
 
 
+def extract_docx_text(docx_path):
+    """
+    Extract text from a .docx file, including headers,
+    body (paragraphs and tables), and footers.
+    """
+    try:
+        import docx
+
+        doc = docx.Document(docx_path)
+        parts = []
+
+        # 1) Extract non-empty header paragraphs from each section
+        for section in doc.sections:
+            for para in section.header.paragraphs:
+                text = para.text.strip()
+                if text:
+                    parts.append(text)
+
+        # 2) Extract body blocks (paragraphs and tables) in document order
+        parent_elm = doc.element.body
+        for child in parent_elm.iterchildren():
+            if isinstance(child, docx.oxml.text.paragraph.CT_P):
+                para = docx.text.paragraph.Paragraph(child, doc)
+                text = para.text.strip()
+                if text:
+                    parts.append(text)
+
+            elif isinstance(child, docx.oxml.table.CT_Tbl):
+                table = docx.table.Table(child, doc)
+                for row in table.rows:
+                    cells = [cell.text.strip() for cell in row.cells]
+                    parts.append("\t".join(cells))
+
+        # 3) Extract non-empty footer paragraphs from each section
+        for section in doc.sections:
+            for para in section.footer.paragraphs:
+                text = para.text.strip()
+                if text:
+                    parts.append(text)
+
+        return "\n".join(parts)
+
+    except Exception as e:
+        logger.error(f"Error extracting text from DOCX: {e}")
+        return f"[Error extracting DOCX text: {str(e)}]"
+
+
+def generate_search_query(user_message, state):
+    """Generate a search query from user message using the LLM"""
+    # Augment the user message with search instruction
+    augmented_message = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else."
+
+    # Use a minimal state for search query generation but keep the full history
+    search_state = state.copy()
+    search_state['max_new_tokens'] = 64
+    search_state['auto_max_new_tokens'] = False
+    search_state['enable_thinking'] = False
+
+    # Generate the full prompt using existing history + augmented message
+    formatted_prompt = generate_chat_prompt(augmented_message, search_state)
+
+    query = ""
+    for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True):
+        query = reply
+
+    # Strip and remove surrounding quotes if present
+    query = query.strip()
+    if len(query) >= 2 and query.startswith('"') and query.endswith('"'):
+        query = query[1:-1]
+
+    return query
+
+
 def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False):
     # Handle dict format with text and files
     files = []
@@ -614,7 +696,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             state['image_attachments'] = image_attachments
 
         # Add web search results as attachments if enabled
-        add_web_search_attachments(output, row_idx, text, state)
+        if state.get('enable_web_search', False):
+            search_query = generate_search_query(text, state)
+            add_web_search_attachments(output, row_idx, text, search_query, state)
 
         # Apply extensions
         text, visible_text = apply_extensions('chat_input', text, visible_text, state)
@@ -638,9 +722,18 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
     if regenerate:
         row_idx = len(output['internal']) - 1
 
-        # Store the first response as a version before regenerating
+        # Store the old response as a version before regenerating
         if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'):
-            add_message_version(output, row_idx, is_current=False)
+            add_message_version(output, "assistant", row_idx, is_current=False)
+
+        # Add new empty version (will be filled during streaming)
+        key = f"assistant_{row_idx}"
+        output['metadata'][key]["versions"].append({
+            "content": "",
+            "visible_content": "",
+            "timestamp": get_current_timestamp()
+        })
+        output['metadata'][key]["current_version_index"] = len(output['metadata'][key]["versions"]) - 1
 
         if loading_message:
             yield {
@@ -672,7 +765,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
 
     # Add timestamp for assistant's response at the start of generation
     row_idx = len(output['internal']) - 1
-    update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp())
+    update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp(), model_name=shared.model_name)
 
     # Generate
     reply = None
@@ -694,33 +787,51 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
         if _continue:
             output['internal'][-1] = [text, last_reply[0] + reply]
             output['visible'][-1] = [visible_text, last_reply[1] + visible_reply]
-            if is_stream:
-                yield output
         elif not (j == 0 and visible_reply.strip() == ''):
             output['internal'][-1] = [text, reply.lstrip(' ')]
             output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')]
-            if is_stream:
-                yield output
 
-    # Add the newly generated response as a version (only for regeneration)
-    if regenerate:
-        row_idx = len(output['internal']) - 1
-        add_message_version(output, row_idx, is_current=True)
+        # Keep version metadata in sync during streaming (for regeneration)
+        if regenerate:
+            row_idx = len(output['internal']) - 1
+            key = f"assistant_{row_idx}"
+            current_idx = output['metadata'][key]['current_version_index']
+            output['metadata'][key]['versions'][current_idx].update({
+                'content': output['internal'][row_idx][1],
+                'visible_content': output['visible'][row_idx][1]
+            })
+
+        if is_stream:
+            yield output
 
     output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
+
+    # Final sync for version metadata (in case streaming was disabled)
+    if regenerate:
+        row_idx = len(output['internal']) - 1
+        key = f"assistant_{row_idx}"
+        current_idx = output['metadata'][key]['current_version_index']
+        output['metadata'][key]['versions'][current_idx].update({
+            'content': output['internal'][row_idx][1],
+            'visible_content': output['visible'][row_idx][1]
+        })
+
     yield output
 
 
-def impersonate_wrapper(text, state):
+def impersonate_wrapper(textbox, state):
+    text = textbox['text']
     static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
 
     prompt = generate_chat_prompt('', state, impersonate=True)
     stopping_strings = get_stopping_strings(state)
 
-    yield text + '...', static_output
+    textbox['text'] = text + '...'
+    yield textbox, static_output
     reply = None
     for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True):
-        yield (text + reply).lstrip(' '), static_output
+        textbox['text'] = (text + reply).lstrip(' ')
+        yield textbox, static_output
         if shared.stop_everything:
             return
 
@@ -769,7 +880,9 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
     last_save_time = time.monotonic()
     save_interval = 8
     for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)):
-        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
+        yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'], last_message_only=(i > 0)), history
+        if i == 0:
+            time.sleep(0.125)  # We need this to make sure the first update goes through
 
         current_time = time.monotonic()
         # Save on first iteration or if save_interval seconds have passed
@@ -800,9 +913,12 @@ def remove_last_message(history):
     return html.unescape(last[0]), history
 
 
-def send_dummy_message(textbox, state):
+def send_dummy_message(text, state):
     history = state['history']
-    text = textbox['text']
+
+    # Handle both dict and string inputs
+    if isinstance(text, dict):
+        text = text['text']
 
     # Initialize metadata if not present
     if 'metadata' not in history:
@@ -816,9 +932,12 @@ def send_dummy_message(text, state):
     return history
 
 
-def send_dummy_reply(textbox, state):
+def send_dummy_reply(text, state):
     history = state['history']
-    text = textbox['text']
+
+    # Handle both dict and string inputs
+    if isinstance(text, dict):
+        text = text['text']
 
     # Initialize metadata if not present
     if 'metadata' not in history:
@@ -1487,76 +1606,76 @@ def handle_edit_message_click(state):
     if message_index >= len(history['internal']):
         html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
-        return [history, html_output, gr.update()]
+        return [history, html_output]
 
-    # Use the role passed from frontend
-    is_user_msg = (role == "user")
-    role_idx = 0 if is_user_msg else 1
+    role_idx = 0 if role == "user" else 1
 
-    # For assistant messages, save the original version BEFORE updating content
-    if not is_user_msg:
-        if not history['metadata'].get(f"assistant_{message_index}", {}).get('versions'):
-            add_message_version(history, message_index, is_current=False)
+    if 'metadata' not in history:
+        history['metadata'] = {}
+
+    key = f"{role}_{message_index}"
+    if key not in history['metadata']:
+        history['metadata'][key] = {}
+
+    # If no versions exist yet for this message, store the current (pre-edit) content as the first version.
+    if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]:
+        original_content = history['internal'][message_index][role_idx]
+        original_visible = history['visible'][message_index][role_idx]
+        original_timestamp = history['metadata'][key].get('timestamp', get_current_timestamp())
+
+        history['metadata'][key]["versions"] = [{
+            "content": original_content,
+            "visible_content": original_visible,
+            "timestamp": original_timestamp
+        }]
 
-    # NOW update the message content
     history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True)
     history['visible'][message_index][role_idx] = html.escape(new_text)
 
-    # Branch if editing user message, add version if editing assistant message
-    if is_user_msg:
-        # Branch like branch-here
-        history['visible'] = history['visible'][:message_index + 1]
-        history['internal'] = history['internal'][:message_index + 1]
-        new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
-        save_history(history, new_unique_id, state['character_menu'], state['mode'])
-        histories = find_all_histories_with_first_prompts(state)
-        past_chats_update = gr.update(choices=histories, value=new_unique_id)
-        state['unique_id'] = new_unique_id
-    elif not is_user_msg:
-        # Add the new version as current
-        add_message_version(history, message_index, is_current=True)
-        past_chats_update = gr.update()
-    else:
-        past_chats_update = gr.update()
+    add_message_version(history, role, message_index, is_current=True)
 
     save_history(history, state['unique_id'], state['character_menu'], state['mode'])
 
     html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
-    return [history, html_output, past_chats_update]
+    return [history, html_output]
 
 
 def handle_navigate_version_click(state):
     history = state['history']
     message_index = int(state['navigate_message_index'])
     direction = state['navigate_direction']
+    role = state['navigate_message_role']
 
-    # Get assistant message metadata
-    key = f"assistant_{message_index}"
-    if key not in history['metadata'] or 'versions' not in history['metadata'][key]:
-        # No versions to navigate
+    if not role:
+        logger.error("Role not provided for version navigation.")
+        html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
+        return [history, html]
+
+    key = f"{role}_{message_index}"
+    if 'metadata' not in history or key not in history['metadata'] or 'versions' not in history['metadata'][key]:
         html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
         return [history, html]
 
     metadata = history['metadata'][key]
-    current_idx = metadata.get('current_version_index', 0)
     versions = metadata['versions']
+    # Default to the last version if current_version_index is not set
+    current_idx = metadata.get('current_version_index', len(versions) - 1 if versions else 0)
 
-    # Calculate new index
     if direction == 'left':
         new_idx = max(0, current_idx - 1)
     else:  # right
         new_idx = min(len(versions) - 1, current_idx + 1)
 
     if new_idx == current_idx:
-        # No change needed
        html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
        return [history, html]
 
-    # Update history with new version
-    version = versions[new_idx]
-    history['internal'][message_index][1] = version['content']
-    history['visible'][message_index][1] = version['visible_content']
+    msg_content_idx = 0 if role == 'user' else 1  # 0 for user content, 1 for assistant content in the pair
+    version_to_load = versions[new_idx]
+    history['internal'][message_index][msg_content_idx] = version_to_load['content']
+    history['visible'][message_index][msg_content_idx] = version_to_load['visible_content']
     metadata['current_version_index'] = new_idx
+    update_message_metadata(history['metadata'], role, message_index, timestamp=version_to_load['timestamp'])
 
     # Redraw and save
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

From 7366ff5dfa760f84cd0ff69c460888791466fd95 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 5 Jun 2025 10:49:02 -0700
Subject: [PATCH 09/10] Change a class name

---
 css/main.css              | 2 +-
 modules/html_generator.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/css/main.css b/css/main.css
index 268ddb74..10089b1d 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1555,7 +1555,7 @@ strong {
     flex-direction: column;
 }
 
-.attachment-image {
+.image-preview {
     border-radius: 16px;
     margin-bottom: 5px;
     object-fit: cover;
diff --git a/modules/html_generator.py b/modules/html_generator.py
index aa037314..44e0b236 100644
--- a/modules/html_generator.py
+++ b/modules/html_generator.py
@@ -377,7 +377,7 @@ def format_message_attachments(history, role, index):
                 file_path = attachment.get("file_path", "")
                 attachments_html += (
                     f'<div class="attachment-box image-attachment">'
-                    f'<img src="file/{file_path}" class="attachment-image" alt="{name}">'
+                    f'<img src="file/{file_path}" class="image-preview" alt="{name}">'
                     f'<div class="attachment-name">{name}</div>'
                     f'</div>'
                 )

From 0783f5c891badc96146d654d1f2bcbdd99f433af Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 5 Jun 2025 11:42:12 -0700
Subject: [PATCH 10/10] Use the latest format

---
 extensions/openai/completions.py |  2 +-
 modules/chat.py                  | 21 +++++++++++++++++----
 modules/llama_cpp_server.py      | 12 ++++++------
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py
index 2374733a..4e4e310f 100644
--- a/extensions/openai/completions.py
+++ b/extensions/openai/completions.py
@@ -150,7 +150,7 @@ def convert_history(history):
             # Process multimodal content
             processed_content, images = process_multimodal_content(content)
             if images:
-                image_refs = "".join(f"[img-{img['image_id']}]" for img in images)
+                image_refs = "".join("<__media__>" for img in images)
                 processed_content = f"{processed_content} {image_refs}"
 
diff --git a/modules/chat.py b/modules/chat.py
index 275f28f9..9dc8d1fd 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -225,7 +225,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
                 for attachment in metadata[user_key]["attachments"]:
                     if attachment.get("type") == "image":
                         # Add image reference for multimodal models
-                        image_refs += f"[img-{attachment['image_id']}]"
+                        image_refs += "<__media__>"
                     else:
                         # Handle text/PDF attachments as before
                         filename = attachment.get("name", "file")
@@ -260,7 +260,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
 
                 for attachment in metadata[user_key]["attachments"]:
                     if attachment.get("type") == "image":
-                        image_refs += f"[img-{attachment['image_id']}]"
+                        image_refs += "<__media__>"
                     else:
                         filename = attachment.get("name", "file")
                         content = attachment.get("content", "")
@@ -517,17 +517,30 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
             with open(path, 'rb') as f:
                 image_data = base64.b64encode(f.read()).decode('utf-8')
 
+            # Determine MIME type from extension
+            mime_type_map = {
+                '.jpg': 'image/jpeg',
+                '.jpeg': 'image/jpeg',
+                '.png': 'image/png',
+                '.webp': 'image/webp',
+                '.bmp': 'image/bmp',
+                '.gif': 'image/gif'
+            }
+            mime_type = mime_type_map.get(file_extension, 'image/jpeg')
+
+            # Format as data URL
+            data_url = f"data:{mime_type};base64,{image_data}"
+
             # Generate unique image ID
             image_id = len([att for att in history['metadata'][key]["attachments"] if att.get("type") == "image"]) + 1
 
             attachment = {
                 "name": filename,
                 "type": "image",
-                "image_data": image_data,
+                "image_data": data_url,
                 "image_id": image_id,
                 "file_path": str(path)  # For UI preview
             }
-
         elif file_extension == '.pdf':
             # Process PDF file
             content = extract_pdf_text(path)
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index 8b4ed7a7..ca1b2c47 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -123,15 +123,15 @@ class LlamaServer:
 
         # Add image data if present
         if 'image_attachments' in state:
-            image_data = []
+            medias = []
             for attachment in state['image_attachments']:
-                image_data.append({
-                    "data": attachment['image_data'],
-                    "id": attachment['image_id']
+                medias.append({
+                    "type": "image",
+                    "data": attachment['image_data']
                 })
 
-            if image_data:
-                payload["image_data"] = image_data
+            if medias:
+                payload["medias"] = medias
 
         return payload
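
The series above can be exercised end to end through the OpenAI-compatible endpoint once a vision-capable GGUF model (with its mmproj file) is loaded. The following client sketch is not part of the patch series; it assumes the server's default API address (http://127.0.0.1:5000) and a local test image named cat.png, both of which are placeholders. It sends a base64 data URL in the multimodal message format that process_multimodal_content() and the validation code in patch 02 accept:

# Hypothetical client for the multimodal API added in these patches.
# Assumptions: API server at 127.0.0.1:5000, a vision-capable model
# loaded, and a local file "cat.png" to send (both are placeholders).
import base64

import requests

with open("cat.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "messages": [{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        ],
    }],
    "max_tokens": 512,
}

response = requests.post("http://127.0.0.1:5000/v1/chat/completions", json=payload, timeout=120)
print(response.json()["choices"][0]["message"]["content"])

Plain http(s) image URLs should also work: process_image_url() downloads them server-side, using the browser-style User-Agent header added in patch 03, and converts them to base64 before they reach llama.cpp.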