From 27641ac1823751165615a1a53b62ae24977e37a0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 17:09:05 -0700 Subject: [PATCH 01/59] UI: Make message editing work the same for user and assistant messages --- js/global_scope_js.js | 28 ++++++------ modules/chat.py | 94 ++++++++++++++++++++------------------- modules/html_generator.py | 42 ++++++++++------- modules/ui.py | 3 +- modules/ui_chat.py | 4 +- 5 files changed, 94 insertions(+), 77 deletions(-) diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 0e86d450..3274f47e 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -186,31 +186,33 @@ function navigateVersion(element, direction) { const index = messageElement.getAttribute("data-index"); if (!index) return; - const indexInput = document.getElementById("Navigate-message-index").querySelector("input"); - if (!indexInput) { - console.error("Element with ID 'Navigate-message-index' not found."); - return; - } - - const directionInput = document.getElementById("Navigate-direction").querySelector("textarea"); - if (!directionInput) { - console.error("Element with ID 'Navigate-direction' not found."); - return; + // Determine role based on message element classes + let role = "assistant"; // Default role + if (messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") || + messageElement.querySelector(".circle-you")) { + role = "user"; } + const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input"); + const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea"); + const roleInput = document.getElementById("Navigate-message-role")?.querySelector("textarea"); const navigateButton = document.getElementById("Navigate-version"); - if (!navigateButton) { - console.error("Required element 'Navigate-version' not found."); + + if (!indexInput || !directionInput || !roleInput || !navigateButton) { + console.error("Navigation control elements (index, direction, role, or button) not found."); return; } indexInput.value = index; directionInput.value = direction; + roleInput.value = role; - // Trigger any 'change' or 'input' events Gradio might be listening for + // Trigger 'input' events for Gradio to pick up changes const event = new Event("input", { bubbles: true }); indexInput.dispatchEvent(event); directionInput.dispatchEvent(event); + roleInput.dispatchEvent(event); navigateButton.click(); } diff --git a/modules/chat.py b/modules/chat.py index b2aacd5c..8bac680c 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -451,19 +451,21 @@ def get_stopping_strings(state): return result -def add_message_version(history, row_idx, is_current=True): - key = f"assistant_{row_idx}" +def add_message_version(history, role, row_idx, is_current=True): + key = f"{role}_{row_idx}" + if 'metadata' not in history: + history['metadata'] = {} if key not in history['metadata']: history['metadata'][key] = {} if "versions" not in history['metadata'][key]: history['metadata'][key]["versions"] = [] - current_content = history['internal'][row_idx][1] - current_visible = history['visible'][row_idx][1] + # Determine which index to use for content based on role + content_idx = 0 if role == 'user' else 1 + current_content = history['internal'][row_idx][content_idx] + current_visible = history['visible'][row_idx][content_idx] - # Always add the current message as a new version entry. 
- # The timestamp will differentiate it even if content is identical to a previous version. history['metadata'][key]["versions"].append({ "content": current_content, "visible_content": current_visible, @@ -594,7 +596,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess # Store the first response as a version before regenerating if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'): - add_message_version(output, row_idx, is_current=False) + add_message_version(output, "assistant", row_idx, is_current=False) if loading_message: yield { @@ -656,12 +658,13 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if is_stream: yield output + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) + # Add the newly generated response as a version (only for regeneration) if regenerate: row_idx = len(output['internal']) - 1 - add_message_version(output, row_idx, is_current=True) + add_message_version(output, "assistant", row_idx, is_current=True) - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output @@ -1441,37 +1444,35 @@ def handle_edit_message_click(state): if message_index >= len(history['internal']): html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html_output, gr.update()] + return [history, html_output, gr.update()] # No unique_id change - # Use the role passed from frontend - is_user_msg = (role == "user") - role_idx = 0 if is_user_msg else 1 + role_idx = 0 if role == "user" else 1 - # For assistant messages, save the original version BEFORE updating content - if not is_user_msg: - if not history['metadata'].get(f"assistant_{message_index}", {}).get('versions'): - add_message_version(history, message_index, is_current=False) + if 'metadata' not in history: + history['metadata'] = {} + + key = f"{role}_{message_index}" + if key not in history['metadata']: + history['metadata'][key] = {} + + # If no versions exist yet for this message, store the current (pre-edit) content as the first version. 
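For reference, the per-message metadata that this handler and add_message_version() build up has roughly the following shape. This is an illustrative sketch: the key names come from the patch, while the row index, the values, and the timestamp format are invented.

    # Hypothetical contents of history['metadata'] for the assistant message in
    # row 3 after one edit. "versions" keeps every revision; "current_version_index"
    # marks the one currently shown, which the navigate-version handler moves left/right.
    metadata_entry = {
        "versions": [
            {"content": "first reply",  "visible_content": "first reply",  "timestamp": "May 28, 2025 17:01"},
            {"content": "edited reply", "visible_content": "edited reply", "timestamp": "May 28, 2025 17:09"},
        ],
        "current_version_index": 1,
    }
    history = {"metadata": {"assistant_3": metadata_entry}}  # user edits use a "user_<row>" key instead
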
+ if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]: + original_content = history['internal'][message_index][role_idx] + original_visible = history['visible'][message_index][role_idx] + + history['metadata'][key]["versions"] = [{ + "content": original_content, + "visible_content": original_visible, + "timestamp": get_current_timestamp() + }] - # NOW update the message content history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True) history['visible'][message_index][role_idx] = html.escape(new_text) - # Branch if editing user message, add version if editing assistant message - if is_user_msg: - # Branch like branch-here - history['visible'] = history['visible'][:message_index + 1] - history['internal'] = history['internal'][:message_index + 1] - new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') - save_history(history, new_unique_id, state['character_menu'], state['mode']) - histories = find_all_histories_with_first_prompts(state) - past_chats_update = gr.update(choices=histories, value=new_unique_id) - state['unique_id'] = new_unique_id - elif not is_user_msg: - # Add the new version as current - add_message_version(history, message_index, is_current=True) - past_chats_update = gr.update() - else: - past_chats_update = gr.update() + add_message_version(history, role, message_index, is_current=True) + + # Since we are not branching, unique_id does not change. + past_chats_update = gr.update() save_history(history, state['unique_id'], state['character_menu'], state['mode']) html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) @@ -1483,33 +1484,36 @@ def handle_navigate_version_click(state): history = state['history'] message_index = int(state['navigate_message_index']) direction = state['navigate_direction'] + role = state['navigate_message_role'] - # Get assistant message metadata - key = f"assistant_{message_index}" - if key not in history['metadata'] or 'versions' not in history['metadata'][key]: - # No versions to navigate + if not role: + logger.error("Role not provided for version navigation.") + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + key = f"{role}_{message_index}" + if 'metadata' not in history or key not in history['metadata'] or 'versions' not in history['metadata'][key]: html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) return [history, html] metadata = history['metadata'][key] - current_idx = metadata.get('current_version_index', 0) versions = metadata['versions'] + # Default to the last version if current_version_index is not set + current_idx = metadata.get('current_version_index', len(versions) - 1 if versions else 0) - # Calculate new index if direction == 'left': new_idx = max(0, current_idx - 1) else: # right new_idx = min(len(versions) - 1, current_idx + 1) if new_idx == current_idx: - # No change needed html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) return [history, html] - # Update history with new version - version = versions[new_idx] - history['internal'][message_index][1] = version['content'] - history['visible'][message_index][1] = version['visible_content'] + msg_content_idx = 0 if role == 'user' else 1 # 0 for user content, 1 for assistant content 
in the pair + version_to_load = versions[new_idx] + history['internal'][message_index][msg_content_idx] = version_to_load['content'] + history['visible'][message_index][msg_content_idx] = version_to_load['visible_content'] metadata['current_version_index'] = new_idx # Redraw and save diff --git a/modules/html_generator.py b/modules/html_generator.py index bfb278cd..cbf3e19c 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -388,16 +388,17 @@ def format_message_attachments(history, role, index): return "" -def get_version_navigation_html(history, i): +def get_version_navigation_html(history, i, role): """Generate simple navigation arrows for message versions""" - key = f"assistant_{i}" + key = f"{role}_{i}" metadata = history.get('metadata', {}) if key not in metadata or 'versions' not in metadata[key]: return "" versions = metadata[key]['versions'] - current_idx = metadata[key].get('current_version_index', 0) + # Default to the last version if current_version_index isn't set in metadata + current_idx = metadata[key].get('current_version_index', len(versions) - 1 if versions else 0) if len(versions) <= 1: return "" @@ -413,22 +414,33 @@ def get_version_navigation_html(history, i): def actions_html(history, i, role, info_message=""): + action_buttons = "" + version_nav_html = "" + if role == "assistant": - return (f'
' - f'{copy_button}' - f'{edit_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' - f'{branch_button}' - f'{info_message}' - f'
' - f'{get_version_navigation_html(history, i)}') - return (f'
' + action_buttons = ( f'{copy_button}' f'{edit_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'{continue_button if i == len(history["visible"]) - 1 else ""}' + f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{branch_button}' + ) + + version_nav_html = get_version_navigation_html(history, i, "assistant") + elif role == "user": + action_buttons = ( + f'{copy_button}' + f'{edit_button}' + ) + + version_nav_html = get_version_navigation_html(history, i, "user") + + return (f'
' + f'{action_buttons}' f'{info_message}' - f'
') + f'
' + f'{version_nav_html}') def generate_instruct_html(history): diff --git a/modules/ui.py b/modules/ui.py index e24e6402..a2662e14 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -212,14 +212,13 @@ def list_interface_input_elements(): 'grammar_string', 'navigate_message_index', 'navigate_direction', + 'navigate_message_role', 'edit_message_index', 'edit_message_text', 'edit_message_role', 'branch_index', 'enable_web_search', 'web_search_pages', - 'navigate_message_index', - 'navigate_direction', ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 719af85a..df3d3929 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -110,6 +110,7 @@ def create_ui(): with gr.Row(visible=False): shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction") + shared.gradio['navigate_message_role'] = gr.Textbox(value="", elem_id="Navigate-message-role") shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version") shared.gradio['edit_message_index'] = gr.Number(value=-1, precision=0, elem_id="Edit-message-index") shared.gradio['edit_message_text'] = gr.Textbox(value="", elem_id="Edit-message-text") @@ -313,8 +314,7 @@ def create_event_handlers(): shared.gradio['edit_message'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False).then( - lambda: None, None, None, js='() => { const role = document.getElementById("Edit-message-role").querySelector("textarea").value; if (role === "user") document.getElementById("Regenerate").click(); }') + chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) # Save/delete a character shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) From 3eb0b77427ad7b87c128999fd915f97b22104819 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 18:14:51 -0700 Subject: [PATCH 02/59] Improve the web search query generation --- modules/chat.py | 25 ++++++++++++++++++++++++- modules/web_search.py | 29 ++++------------------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 8bac680c..495fe934 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -538,6 +538,27 @@ def extract_pdf_text(pdf_path): return f"[Error extracting PDF text: {str(e)}]" +def generate_search_query(user_message, state): + """Generate a search query from user message using the LLM""" + # Augment the user message with search instruction + augmented_message = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else." 
+ + # Use a minimal state for search query generation but keep the full history + search_state = state.copy() + search_state['max_new_tokens'] = 64 + search_state['auto_max_new_tokens'] = False + search_state['enable_thinking'] = False + + # Generate the full prompt using existing history + augmented message + formatted_prompt = generate_chat_prompt(augmented_message, search_state) + + query = "" + for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True): + query = reply.strip() + + return query + + def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False): # Handle dict format with text and files files = [] @@ -570,7 +591,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess add_message_attachment(output, row_idx, file_path, is_user=True) # Add web search results as attachments if enabled - add_web_search_attachments(output, row_idx, text, state) + if state.get('enable_web_search', False): + search_query = generate_search_query(text, state) + add_web_search_attachments(output, row_idx, text, search_query, state) # Apply extensions text, visible_text = apply_extensions('chat_input', text, visible_text, state) diff --git a/modules/web_search.py b/modules/web_search.py index d3387ac9..667178c5 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -13,22 +13,6 @@ def get_current_timestamp(): return datetime.now().strftime('%b %d, %Y %H:%M') -def generate_search_query(user_message, state): - """Generate a search query from user message using the LLM""" - search_prompt = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else." 
- - # Use a minimal state for search query generation - search_state = state.copy() - search_state['max_new_tokens'] = 64 - search_state['temperature'] = 0.1 - - query = "" - for reply in generate_reply(search_prompt, search_state, stopping_strings=[], is_chat=False): - query = reply.strip() - - return query - - def download_web_page(url, timeout=10): """Download and extract text from a web page""" try: @@ -82,19 +66,14 @@ def perform_web_search(query, num_pages=3): return [] -def add_web_search_attachments(history, row_idx, user_message, state): +def add_web_search_attachments(history, row_idx, user_message, search_query, state): """Perform web search and add results as attachments""" - if not state.get('enable_web_search', False): + if not search_query: + logger.warning("No search query provided") return try: - # Generate search query - search_query = generate_search_query(user_message, state) - if not search_query: - logger.warning("Failed to generate search query") - return - - logger.info(f"Generated search query: {search_query}") + logger.info(f"Using search query: {search_query}") # Perform web search num_pages = int(state.get('web_search_pages', 3)) From 7080a02252b9949297950ef3669361d21f4a6bcf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 18:15:21 -0700 Subject: [PATCH 03/59] Reduce the timeout for downloading web pages --- modules/web_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/web_search.py b/modules/web_search.py index 667178c5..070f850c 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -13,7 +13,7 @@ def get_current_timestamp(): return datetime.now().strftime('%b %d, %Y %H:%M') -def download_web_page(url, timeout=10): +def download_web_page(url, timeout=5): """Download and extract text from a web page""" try: headers = { From 75d6cfd14d1aed5ba19bd747479794cbd34212d0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 20:34:14 -0700 Subject: [PATCH 04/59] Download fetched web search results in parallel --- modules/web_search.py | 44 +++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/modules/web_search.py b/modules/web_search.py index 070f850c..1f670349 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -1,3 +1,5 @@ +import concurrent.futures +from concurrent.futures import as_completed from datetime import datetime import requests @@ -5,7 +7,6 @@ from bs4 import BeautifulSoup from duckduckgo_search import DDGS from modules.logging_colors import logger -from modules.text_generation import generate_reply def get_current_timestamp(): @@ -40,27 +41,50 @@ def download_web_page(url, timeout=5): return f"[Error downloading content from {url}: {str(e)}]" -def perform_web_search(query, num_pages=3): +def perform_web_search(query, num_pages=3, max_workers=5): """Perform web search and return results with content""" try: with DDGS() as ddgs: results = list(ddgs.text(query, max_results=num_pages)) - search_results = [] + # Prepare download tasks + download_tasks = [] for i, result in enumerate(results): url = result.get('href', '') title = result.get('title', f'Search Result {i+1}') + download_tasks.append((url, title, i)) - # Download page content - content = download_web_page(url) + search_results = [None] * len(download_tasks) # Pre-allocate to maintain order - search_results.append({ - 'title': title, - 'url': url, - 'content': content - 
}) + # Download pages in parallel + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all download tasks + future_to_task = { + executor.submit(download_web_page, task[0]): task + for task in download_tasks + } + + # Collect results as they complete + for future in as_completed(future_to_task): + url, title, index = future_to_task[future] + try: + content = future.result() + search_results[index] = { + 'title': title, + 'url': url, + 'content': content + } + except Exception as e: + logger.error(f"Error downloading {url}: {e}") + # Include failed downloads with empty content + search_results[index] = { + 'title': title, + 'url': url, + 'content': '' + } return search_results + except Exception as e: logger.error(f"Error performing web search: {e}") return [] From 63234b9b6f60ec4f276480b4e7f9d4cd1395dcaf Mon Sep 17 00:00:00 2001 From: Underscore <47636331+Th-Underscore@users.noreply.github.com> Date: Thu, 29 May 2025 07:22:03 -0400 Subject: [PATCH 05/59] UI: Fix impersonate (#7025) --- modules/chat.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 495fe934..7afd906d 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -691,16 +691,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess yield output -def impersonate_wrapper(text, state): +def impersonate_wrapper(textbox, state): + text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) prompt = generate_chat_prompt('', state, impersonate=True) stopping_strings = get_stopping_strings(state) - yield text + '...', static_output + textbox['text'] = text + '...' 
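The fix here hinges on the chat input no longer being a plain string: it is the same `{'text': ..., 'files': [...]}` dict that chatbot_wrapper unpacks in the earlier patch, so impersonate has to read from and yield back that dict rather than a bare string. A minimal sketch of the shape involved (values invented; not an exact Gradio schema):

    # Hypothetical payload of the multimodal chat textbox; the 'text'/'files'
    # keys mirror the handling in chatbot_wrapper ("files = text.get('files', [])").
    textbox = {"text": "Tell me about", "files": []}

    # impersonate_wrapper now mutates the dict in place and yields the whole dict
    # back, so the textbox keeps its structure (and any attached files) while the
    # impersonated text streams in.
    textbox["text"] += "..."
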
+ yield textbox, static_output reply = None for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True): - yield (text + reply).lstrip(' '), static_output + textbox['text'] = (text + reply).lstrip(' ') + yield textbox, static_output if shared.stop_everything: return From a8d02dec8f5e6a054a153b3b09425b51e090ae11 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:24:21 -0700 Subject: [PATCH 06/59] Bump llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 0eaf10da..5f61aff9 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 65f184bf..a718b6ca 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; 
platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index d20b2ec3..5fddc623 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 2613d787..8e014445 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index af583b00..77779f3d 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,8 +32,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 9bf2a37d..79efc607 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -32,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 1731448e..8b29453e 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -32,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index fc481a1a..f1f4a02e 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index fdae681d..adf50d9a 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
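The `;` suffixes on these wheel URLs are standard PEP 508 environment markers: pip evaluates them against the local machine and installs only the wheels whose condition is true. A small sketch of that evaluation using the packaging library (assumed to be available; it is not part of this patch):

    from packaging.markers import Marker

    # True only on 64-bit x86 Linux running Python 3.11, mirroring the condition
    # attached to the Linux wheels above; on other machines it evaluates to False.
    marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')
    print(marker.evaluate())
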
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index a58f39f7..46b36791 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 91ea3a6d..66052711 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt 
b/requirements/portable/requirements_cpu_only.txt index 37e5aa40..4013abcc 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index dcb2884b..41808854 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 8f1295bb..cff79ec6 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 858b4488..762b3fa3 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 569bae99..b425d305 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 685cfe254036111711de027f6d3a8198d02e7545 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:26:43 -0700 Subject: [PATCH 07/59] Lint --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 181a19b8..8af87b42 100644 --- a/css/main.css +++ b/css/main.css @@ -265,7 +265,7 @@ button { .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: rgba(255, 255, 255, 0.2); + background: rgb(255 255 255 / 20%); border-radius: 10px; } From f2ee917d4f600ebbc5fa9d5fcf65cf5feef27fc1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:55:05 -0700 Subject: [PATCH 08/59] Update README --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7105ce23..afb21cb0 100644 --- a/README.md +++ b/README.md @@ -14,14 +14,17 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. 
+- **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. +- **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. +- **Advanced chat management**: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. +- **Automatic prompt formatting** using Jinja2 templates. You don't need to ever worry about prompt formats. - UI that resembles the original ChatGPT style. -- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. - Multiple sampling parameters and generation options for sophisticated text generation control. - Switch between different models easily in the UI without restarting, with fine control over settings. - OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). -- 100% offline and private, with zero telemetry, external resources, or remote update requests. +- 100% offline and private, with zero telemetry, external resources, or remote update requests. Web search is optional and user-controlled. - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. ## How to install From 2a9699033d90f4ffedfb22cbba7003c6441d08dc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:55:59 -0700 Subject: [PATCH 09/59] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index afb21cb0..05809436 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,8 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. -- **Advanced chat management**: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. -- **Automatic prompt formatting** using Jinja2 templates. You don't need to ever worry about prompt formats. +- Advanced chat management: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. +- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. - UI that resembles the original ChatGPT style. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. 
- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. From 9a94d7b4f6ae95b6b4b2fc521b5b25c300915dc9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 05:02:52 -0700 Subject: [PATCH 10/59] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 05809436..900d5fbd 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). -- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. +- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory without affecting your system. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. - Advanced chat management: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. From 0986d075fb22dc5aa582bbefdfdb0ebdb6ee92c8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 05:03:59 -0700 Subject: [PATCH 11/59] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 900d5fbd..ec01c0aa 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory without affecting your system. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. -- Advanced chat management: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. +- Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. - UI that resembles the original ChatGPT style. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. 
From 36bc2760058ed4e6998f4c55176c7311b0facabe Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 05:39:26 -0700 Subject: [PATCH 12/59] Update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ec01c0aa..9accffb7 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. - Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. +- Automatic GPU layers for GGUF models (on NVIDIA GPUs). - UI that resembles the original ChatGPT style. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. From 81794692ab6fbc0ef24c7484b6571de090984dde Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 08:07:14 -0700 Subject: [PATCH 13/59] UI: Make the dark theme darker --- css/main.css | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/css/main.css b/css/main.css index 8af87b42..0d0a13cf 100644 --- a/css/main.css +++ b/css/main.css @@ -1,11 +1,11 @@ :root { --darker-gray: #202123; - --dark-gray: #343541; - --light-gray: #444654; + --dark-gray: #2A2B32; + --light-gray: #373943; --light-theme-gray: #f9fbff; --border-color-dark: #525252; --header-width: 112px; - --selected-item-color-dark: #32333e; + --selected-item-color-dark: #2E2F38; } @font-face { From c970c5f1665c3966c84ba50a05a45d2598038ea6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 08:15:13 -0700 Subject: [PATCH 14/59] Make scrollbars darker in dark theme --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 0d0a13cf..7f9d4618 100644 --- a/css/main.css +++ b/css/main.css @@ -265,7 +265,7 @@ button { .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: rgb(255 255 255 / 20%); + background: rgb(255 255 255 / 10%); border-radius: 10px; } From 3f37a2e915a31b273caddd12a80412a199d753a7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 08:49:31 -0700 Subject: [PATCH 15/59] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9accffb7..361584f8 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). 
-- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory without affecting your system. +- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. - Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. From faa5c82c64e2036762ed3ff60a38fc5b37dac36d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 09:02:34 -0700 Subject: [PATCH 16/59] Fix message version count not updating during regeneration streaming --- modules/chat.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 7afd906d..90d66687 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -617,10 +617,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if regenerate: row_idx = len(output['internal']) - 1 - # Store the first response as a version before regenerating + # Store the old response as a version before regenerating if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'): add_message_version(output, "assistant", row_idx, is_current=False) + # Add new empty version (will be filled during streaming) + key = f"assistant_{row_idx}" + output['metadata'][key]["versions"].append({ + "content": "", + "visible_content": "", + "timestamp": get_current_timestamp() + }) + output['metadata'][key]["current_version_index"] = len(output['metadata'][key]["versions"]) - 1 + if loading_message: yield { 'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]], @@ -673,20 +682,34 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if _continue: output['internal'][-1] = [text, last_reply[0] + reply] output['visible'][-1] = [visible_text, last_reply[1] + visible_reply] - if is_stream: - yield output elif not (j == 0 and visible_reply.strip() == ''): output['internal'][-1] = [text, reply.lstrip(' ')] output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')] - if is_stream: - yield output + + # Keep version metadata in sync during streaming (for regeneration) + if regenerate: + row_idx = len(output['internal']) - 1 + key = f"assistant_{row_idx}" + current_idx = output['metadata'][key]['current_version_index'] + output['metadata'][key]['versions'][current_idx].update({ + 'content': output['internal'][row_idx][1], + 'visible_content': output['visible'][row_idx][1] + }) + + if is_stream: + yield output output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) - # Add the newly generated response as a version (only for regeneration) + # Final sync for version metadata (in case streaming was disabled) if regenerate: row_idx = len(output['internal']) - 1 - add_message_version(output, "assistant", row_idx, is_current=True) + key = f"assistant_{row_idx}" + current_idx = output['metadata'][key]['current_version_index'] + 
output['metadata'][key]['versions'][current_idx].update({ + 'content': output['internal'][row_idx][1], + 'visible_content': output['visible'][row_idx][1] + }) yield output From 724147ffabce95b5d20528b83b6e44c1523d58f0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 10:49:29 -0700 Subject: [PATCH 17/59] Better detect when no model is available --- modules/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/utils.py b/modules/utils.py index 0e8bdd18..577c55b8 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -74,7 +74,7 @@ def natural_keys(text): def check_model_loaded(): if shared.model_name == 'None' or shared.model is None: - if len(get_available_models()) <= 1: + if len(get_available_models()) == 0: error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it" logger.error(error_msg) return False, error_msg From e7129f9dbefbe87fa4c425b5873f80cbddaf7cf0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 12:45:53 -0700 Subject: [PATCH 18/59] Prevent footer buttons below last assistant message from always appearing --- js/main.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/js/main.js b/js/main.js index 48bb8632..ea3ff46a 100644 --- a/js/main.js +++ b/js/main.js @@ -171,7 +171,6 @@ const observer = new MutationObserver(function(mutations) { document.getElementById("Generate").style.display = "flex"; } - doSyntaxHighlighting(); if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) { @@ -184,7 +183,7 @@ const observer = new MutationObserver(function(mutations) { const lastChild = messagesContainer?.lastElementChild; const prevSibling = lastChild?.previousElementSibling; if (lastChild && prevSibling) { - lastChild.style.minHeight = `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px))`; + lastChild.style.setProperty("margin-bottom", `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px)`, "important"); } } }); From aff41f3482bc7045334b0d81ac514723fdbd4f97 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 12:53:41 -0700 Subject: [PATCH 19/59] Update README --- README.md | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 361584f8..daf409d0 100644 --- a/README.md +++ b/README.md @@ -189,13 +189,13 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [-- [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] - [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] + [--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] 
[--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] - [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] - [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] - [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] - [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api] - [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] + [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] + [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] + [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] + [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] + [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] Text generation web UI @@ -217,7 +217,7 @@ Basic settings: --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. Model loader: - --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, + --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM. Transformers/Accelerate: @@ -248,16 +248,18 @@ llama.cpp: --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. --no-mmap Prevent mmap from being used. --mlock Force the system to keep the model in RAM. - --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. + --gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU. --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. --numa Activate NUMA task allocation for llama.cpp. --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. - --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU" + --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU" --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. 
-Context and cache management: +Context and cache: --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. + --cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits + separately, e.g. q4_q8). Speculative decoding: --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. @@ -276,15 +278,9 @@ ExLlamaV2: --num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral. --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. -HQQ: - --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. - TensorRT-LLM: --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. -Cache: - --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. - DeepSpeed: --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. @@ -307,6 +303,7 @@ Gradio: --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy --old-colors Use the legacy Gradio colors, before the December/2024 update. + --portable Hide features not available in portable mode like training. API: --api Enable the API extension. From f59998d2680f346038320b536617c4738c393947 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 13:08:48 -0700 Subject: [PATCH 20/59] Don't limit the number of prompt characters printed with --verbose --- modules/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 962311df..1fd6d810 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -505,11 +505,11 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N return -def print_prompt(prompt, max_chars=2000): +def print_prompt(prompt, max_chars=-1): DARK_YELLOW = "\033[38;5;3m" RESET = "\033[0m" - if len(prompt) > max_chars: + if max_chars > 0 and len(prompt) > max_chars: half_chars = max_chars // 2 hidden_len = len(prompt[half_chars:-half_chars]) hidden_msg = f"{DARK_YELLOW}[...{hidden_len} characters hidden...]{RESET}" From a45a65213052dad02d696ed54af1b9f2ea82cd4a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 13:28:51 -0700 Subject: [PATCH 21/59] CSS fix --- js/main.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index ea3ff46a..f23dc246 100644 --- a/js/main.js +++ b/js/main.js @@ -183,7 +183,10 @@ const observer = new MutationObserver(function(mutations) { const lastChild = messagesContainer?.lastElementChild; const prevSibling = lastChild?.previousElementSibling; if (lastChild && prevSibling) { - lastChild.style.setProperty("margin-bottom", `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px)`, "important"); + lastChild.style.setProperty("margin-bottom", + `max(0px, calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px))`, + "important" + ); } } }); From 8078c41ec67b96656d7e96128d915290b319e4f5 Mon Sep 17 00:00:00 
2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 13:32:19 -0700 Subject: [PATCH 22/59] Revert "Bump llama.cpp" This reverts commit a8d02dec8f5e6a054a153b3b09425b51e090ae11. --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 5f61aff9..0eaf10da 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index a718b6ca..65f184bf 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 5fddc623..d20b2ec3 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 8e014445..2613d787 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 77779f3d..af583b00 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,8 +32,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 79efc607..9bf2a37d 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -32,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 8b29453e..1731448e 100644 --- 
a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -32,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index f1f4a02e..fc481a1a 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index adf50d9a..fdae681d 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 46b36791..a58f39f7 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 66052711..91ea3a6d 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 4013abcc..37e5aa40 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 41808854..dcb2884b 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index cff79ec6..8f1295bb 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 762b3fa3..858b4488 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index b425d305..569bae99 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From dce02732a4caef16157ffbc288dfe079053e0bb4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 14:08:48 -0700 Subject: [PATCH 23/59] Fix timestamp issues when editing/swiping messages --- modules/chat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index 90d66687..6b3ff4fc 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -1508,11 +1508,12 @@ def handle_edit_message_click(state): if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]: original_content = history['internal'][message_index][role_idx] original_visible = history['visible'][message_index][role_idx] + original_timestamp = history['metadata'][key].get('timestamp', get_current_timestamp()) history['metadata'][key]["versions"] = [{ "content": original_content, "visible_content": original_visible, - "timestamp": get_current_timestamp() + "timestamp": original_timestamp }] history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True) @@ -1564,6 +1565,7 @@ def handle_navigate_version_click(state): history['internal'][message_index][msg_content_idx] = version_to_load['content'] history['visible'][message_index][msg_content_idx] = version_to_load['visible_content'] metadata['current_version_index'] = new_idx + update_message_metadata(history['metadata'], role, message_index, timestamp=version_to_load['timestamp']) # Redraw and save html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) From acbcc12e7b19cc9f540d32b8d601ceefde77b7a1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 14:11:21 -0700 Subject: [PATCH 24/59] Clean up --- modules/chat.py | 7 ++----- modules/ui_chat.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 6b3ff4fc..e526a9a0 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -1493,7 +1493,7 @@ def handle_edit_message_click(state): if message_index >= 
len(history['internal']): html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html_output, gr.update()] # No unique_id change + return [history, html_output] role_idx = 0 if role == "user" else 1 @@ -1521,13 +1521,10 @@ def handle_edit_message_click(state): add_message_version(history, role, message_index, is_current=True) - # Since we are not branching, unique_id does not change. - past_chats_update = gr.update() - save_history(history, state['unique_id'], state['character_menu'], state['mode']) html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html_output, past_chats_update] + return [history, html_output] def handle_navigate_version_click(state): diff --git a/modules/ui_chat.py b/modules/ui_chat.py index df3d3929..d79aa523 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -314,7 +314,7 @@ def create_event_handlers(): shared.gradio['edit_message'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) + chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False) # Save/delete a character shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) From d1bfb08e8d4bab174e6b4467eff20f8a01a2a613 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 14:27:47 -0700 Subject: [PATCH 25/59] Improve the style of message editing --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index 7f9d4618..9685c863 100644 --- a/css/main.css +++ b/css/main.css @@ -1462,6 +1462,7 @@ strong { .editing-textarea { width: 100%; min-height: 200px; + max-height: 65vh; padding: 10px; border-radius: 5px; border: 1px solid #ccc; From 28e6bd4fcd8cd385cc92cc56c0c49fc474006147 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 14:49:07 -0700 Subject: [PATCH 26/59] Revert "Update transformers requirement in /requirements/full (#7017)" This reverts commit cc9b7253c1216e5340da85cba9b65a13cf3526e9. 
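
Note on the message-version bookkeeping touched by the chat.py patches earlier in this series (PATCH 16, 23, and 24): the diffs all assume per-message metadata keyed as "assistant_<row>" or "user_<row>", each holding a "versions" list and a "current_version_index". The sketch below is a simplified illustration of that layout, not code from the repository; the field names come from the diffs, while the `history` contents and the timestamp helper are placeholders (the real `get_current_timestamp()` lives in modules/chat.py and may use a different format).

    from datetime import datetime

    def get_current_timestamp():
        # Simplified stand-in for the helper used in modules/chat.py.
        return datetime.now().strftime("%b %d, %Y %H:%M")

    # Minimal history object with one user/assistant pair (illustrative values).
    history = {
        'internal': [["hello", "hi there"]],
        'visible': [["hello", "hi there"]],
        'metadata': {
            'assistant_0': {
                'timestamp': get_current_timestamp(),
                'versions': [
                    {
                        'content': "hi there",
                        'visible_content': "hi there",
                        'timestamp': get_current_timestamp(),
                    },
                ],
                'current_version_index': 0,
            },
        },
    }

    # On regeneration, an empty version is appended up front and then kept in
    # sync on every streamed chunk, mirroring the loop in chatbot_wrapper().
    key = 'assistant_0'
    versions = history['metadata'][key]['versions']
    versions.append({
        'content': "",
        'visible_content': "",
        'timestamp': get_current_timestamp(),
    })
    history['metadata'][key]['current_version_index'] = len(versions) - 1
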
--- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 0eaf10da..2c322715 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.52.* +transformers==4.50.* tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 65f184bf..6aeb325e 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.52.* +transformers==4.50.* tqdm wandb diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index d20b2ec3..3b052423 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.52.* +transformers==4.50.* tqdm wandb diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 2613d787..8c51459e 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.52.* +transformers==4.50.* tqdm wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index af583b00..b9f15d45 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.52.* +transformers==4.50.* tqdm wandb diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 9bf2a37d..0877d968 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.52.* +transformers==4.50.* tqdm wandb diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 1731448e..cab78237 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.52.* +transformers==4.50.* tqdm wandb diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index fc481a1a..dfd42577 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -23,7 +23,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.52.* +transformers==4.50.* tqdm wandb diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 2ed8affa..5d9f84ce 100644 --- 
a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -22,7 +22,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.52.* +transformers==4.50.* tqdm wandb From 7c29879e795776ceb742a8ddb47fd3843069cf34 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 30 May 2025 11:17:47 -0700 Subject: [PATCH 27/59] Fix 'Start reply with' (closes #7033) --- modules/chat.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index e526a9a0..881f7330 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -806,9 +806,12 @@ def remove_last_message(history): return html.unescape(last[0]), history -def send_dummy_message(textbox, state): +def send_dummy_message(text, state): history = state['history'] - text = textbox['text'] + + # Handle both dict and string inputs + if isinstance(text, dict): + text = text['text'] # Initialize metadata if not present if 'metadata' not in history: @@ -822,9 +825,12 @@ def send_dummy_message(textbox, state): return history -def send_dummy_reply(textbox, state): +def send_dummy_reply(text, state): history = state['history'] - text = textbox['text'] + + # Handle both dict and string inputs + if isinstance(text, dict): + text = text['text'] # Initialize metadata if not present if 'metadata' not in history: From 298d4719c6c9545a701a9cc9e8f4efceb108599a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 30 May 2025 11:32:24 -0700 Subject: [PATCH 28/59] Multiple small style improvements --- css/main.css | 4 ++++ modules/ui.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/css/main.css b/css/main.css index 9685c863..967d94ed 100644 --- a/css/main.css +++ b/css/main.css @@ -1551,3 +1551,7 @@ strong { color: var(--body-text-color-subdued); margin-top: 4px; } + +button:focus { + outline: none; +} diff --git a/modules/ui.py b/modules/ui.py index a2662e14..9f4d67cb 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -71,6 +71,7 @@ if not shared.args.old_colors: block_background_fill_dark='transparent', block_border_color_dark='transparent', input_border_color_dark='var(--border-color-dark)', + input_border_color_focus_dark='var(--border-color-dark)', checkbox_border_color_dark='var(--border-color-dark)', border_color_primary_dark='var(--border-color-dark)', button_secondary_border_color_dark='var(--border-color-dark)', @@ -89,6 +90,8 @@ if not shared.args.old_colors: checkbox_label_shadow='none', block_shadow='none', block_shadow_dark='none', + input_shadow_focus='none', + input_shadow_focus_dark='none', button_large_radius='0.375rem', button_large_padding='6px 12px', input_radius='0.375rem', From 219f0a773166deeb0326c2874b29e66e382df524 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 30 May 2025 12:05:49 -0700 Subject: [PATCH 29/59] Fix exllamav3_hf models failing to unload (closes #7031) --- modules/exllamav3_hf.py | 17 +++++++++++++++++ modules/models.py | 3 +++ 2 files changed, 20 insertions(+) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index 417df473..1254ff5d 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -245,3 +245,20 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path) return Exllamav3HF(pretrained_model_name_or_path) + + def unload(self): + """Properly unload 
the ExllamaV3 model and free GPU memory.""" + if hasattr(self, 'ex_model') and self.ex_model is not None: + self.ex_model.unload() + self.ex_model = None + + if hasattr(self, 'ex_cache') and self.ex_cache is not None: + self.ex_cache = None + + # Clean up any additional ExllamaV3 resources + if hasattr(self, 'past_seq'): + self.past_seq = None + if hasattr(self, 'past_seq_negative'): + self.past_seq_negative = None + if hasattr(self, 'ex_cache_negative'): + self.ex_cache_negative = None diff --git a/modules/models.py b/modules/models.py index 4218d58c..d329ae3c 100644 --- a/modules/models.py +++ b/modules/models.py @@ -116,10 +116,13 @@ def unload_model(keep_model_name=False): return is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer') + if shared.args.loader == 'ExLlamav3_HF': + shared.model.unload() shared.model = shared.tokenizer = None shared.lora_names = [] shared.model_dirty_from_training = False + if not is_llamacpp: from modules.torch_utils import clear_torch_cache clear_torch_cache() From 15f466ca3f8255f2566f016db8d7b8fd9ebef3f4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 30 May 2025 15:49:57 -0700 Subject: [PATCH 30/59] Update README --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index daf409d0..55df33d2 100644 --- a/README.md +++ b/README.md @@ -14,18 +14,18 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory. -- **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. -- **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. -- Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. +- 100% offline and private, with zero telemetry, external resources, or remote update requests. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. -- Automatic GPU layers for GGUF models (on NVIDIA GPUs). -- UI that resembles the original ChatGPT style. -- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. -- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. +- **File attachments**: Upload text files and PDF documents to talk about their contents. +- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation. +- Aesthetic UI with dark and light themes. +- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters. 
+- Edit messages, navigate between message versions, and branch conversations at any point. - Multiple sampling parameters and generation options for sophisticated text generation control. -- Switch between different models easily in the UI without restarting, with fine control over settings. +- Switch between different models in the UI without restarting. +- Automatic GPU layers for GGUF models (on NVIDIA GPUs). +- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. - OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). -- 100% offline and private, with zero telemetry, external resources, or remote update requests. Web search is optional and user-controlled. - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. ## How to install From c55d3c61c6e44712e90fa60c1e434d7687e90947 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 31 May 2025 14:21:42 -0700 Subject: [PATCH 31/59] Bump exllamav2 to 0.3.1 --- requirements/full/requirements.txt | 6 +++--- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 6 +++--- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 2c322715..dd631341 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -37,8 +37,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 6aeb325e..acdbd455 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -34,5 +34,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 3b052423..a478d7d3 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -34,5 +34,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 8c51459e..98ed90a2 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -35,4 +35,4 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index b9f15d45..cb72d036 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -36,4 +36,4 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index dfd42577..f6982134 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -37,8 +37,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version 
== "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From dc8ed6dbe769457b3a2758780abefab0ab04c8a4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 31 May 2025 14:27:33 -0700 Subject: [PATCH 32/59] Bump exllamav3 to 0.0.3 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index dd631341..ec055876 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -35,8 +35,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 98ed90a2..96a48f32 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -34,5 +34,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index cb72d036..14b74081 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -35,5 +35,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index f6982134..de507308 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -35,8 +35,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" From 1d88456659d8e71800f6fb732b8cad7d36fa4c20 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 31 May 2025 20:15:07 -0700 Subject: [PATCH 33/59] Add support for .docx attachments --- README.md | 2 +- modules/chat.py | 50 +++++++++++++++++++ requirements/full/requirements.txt | 1 + requirements/full/requirements_amd.txt | 1 + requirements/full/requirements_amd_noavx2.txt | 1 + .../full/requirements_apple_intel.txt | 1 + .../full/requirements_apple_silicon.txt | 1 + requirements/full/requirements_cpu_only.txt | 1 + .../full/requirements_cpu_only_noavx2.txt | 1 + requirements/full/requirements_noavx2.txt | 1 + requirements/full/requirements_nowheels.txt | 1 + requirements/portable/requirements.txt | 1 + .../portable/requirements_apple_intel.txt | 1 + .../portable/requirements_apple_silicon.txt | 1 + .../portable/requirements_cpu_only.txt | 1 + .../portable/requirements_cpu_only_noavx2.txt | 1 + requirements/portable/requirements_noavx2.txt | 1 + .../portable/requirements_nowheels.txt | 1 + requirements/portable/requirements_vulkan.txt | 1 + .../portable/requirements_vulkan_noavx2.txt | 1 + 20 files changed, 69 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 55df33d2..16b02539 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory. - 100% offline and private, with zero telemetry, external resources, or remote update requests. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. -- **File attachments**: Upload text files and PDF documents to talk about their contents. +- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation. - Aesthetic UI with dark and light themes. 
- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters. diff --git a/modules/chat.py b/modules/chat.py index 881f7330..ba61c7a9 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -500,6 +500,9 @@ def add_message_attachment(history, row_idx, file_path, is_user=True): # Process PDF file content = extract_pdf_text(path) file_type = "application/pdf" + elif file_extension == '.docx': + content = extract_docx_text(path) + file_type = "application/docx" else: # Default handling for text files with open(path, 'r', encoding='utf-8') as f: @@ -538,6 +541,53 @@ def extract_pdf_text(pdf_path): return f"[Error extracting PDF text: {str(e)}]" +def extract_docx_text(docx_path): + """ + Extract text from a .docx file, including headers, + body (paragraphs and tables), and footers. + """ + try: + import docx + + doc = docx.Document(docx_path) + parts = [] + + # 1) Extract non-empty header paragraphs from each section + for section in doc.sections: + for para in section.header.paragraphs: + text = para.text.strip() + if text: + parts.append(text) + + # 2) Extract body blocks (paragraphs and tables) in document order + parent_elm = doc.element.body + for child in parent_elm.iterchildren(): + if isinstance(child, docx.oxml.text.paragraph.CT_P): + para = docx.text.paragraph.Paragraph(child, doc) + text = para.text.strip() + if text: + parts.append(text) + + elif isinstance(child, docx.oxml.table.CT_Tbl): + table = docx.table.Table(child, doc) + for row in table.rows: + cells = [cell.text.strip() for cell in row.cells] + parts.append("\t".join(cells)) + + # 3) Extract non-empty footer paragraphs from each section + for section in doc.sections: + for para in section.footer.paragraphs: + text = para.text.strip() + if text: + parts.append(text) + + return "\n".join(parts) + + except Exception as e: + logger.error(f"Error extracting text from DOCX: {e}") + return f"[Error extracting DOCX text: {str(e)}]" + + def generate_search_query(user_message, state): """Generate a search query from user message using the LLM""" # Augment the user message with search instruction diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index ec055876..e61677a6 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -16,6 +16,7 @@ Pillow>=9.5.0 psutil pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index acdbd455..f807199d 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -15,6 +15,7 @@ Pillow>=9.5.0 psutil pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index a478d7d3..4fb70eb1 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -15,6 +15,7 @@ Pillow>=9.5.0 psutil pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 96a48f32..a311ab9b 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -15,6 +15,7 @@ Pillow>=9.5.0 psutil pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git 
a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 14b74081..30e8409a 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -15,6 +15,7 @@ Pillow>=9.5.0 psutil pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 0877d968..70949949 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -15,6 +15,7 @@ Pillow>=9.5.0 psutil pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index cab78237..318bb93a 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -15,6 +15,7 @@ Pillow>=9.5.0 psutil pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index de507308..e0cb84b4 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -16,6 +16,7 @@ Pillow>=9.5.0 psutil pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 5d9f84ce..a412367c 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -15,6 +15,7 @@ Pillow>=9.5.0 psutil pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index fdae681d..bde310e1 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -7,6 +7,7 @@ markdown numpy==1.26.* pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index a58f39f7..521edc0c 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -7,6 +7,7 @@ markdown numpy==1.26.* pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 91ea3a6d..ef7946ff 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -7,6 +7,7 @@ markdown numpy==1.26.* pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 37e5aa40..a3ad743e 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -7,6 +7,7 @@ markdown numpy==1.26.* pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index dcb2884b..eec052d3 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -7,6 +7,7 @@ markdown numpy==1.26.* pydantic==2.8.2 PyPDF2==3.0.1 
+python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 8f1295bb..c9898a05 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -7,6 +7,7 @@ markdown numpy==1.26.* pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index 21805fe2..f6c866cf 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -7,6 +7,7 @@ markdown numpy==1.26.* pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 858b4488..0de9c7cb 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -7,6 +7,7 @@ markdown numpy==1.26.* pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 569bae99..2bfb4d51 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -7,6 +7,7 @@ markdown numpy==1.26.* pydantic==2.8.2 PyPDF2==3.0.1 +python-docx==1.1.2 pyyaml requests rich From 4a2727b71d8976366cc35e18048ad9742ccb1898 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 31 May 2025 20:24:31 -0700 Subject: [PATCH 34/59] Add a tooltip to the file upload button --- js/main.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/js/main.js b/js/main.js index f23dc246..0fdd7ffd 100644 --- a/js/main.js +++ b/js/main.js @@ -872,3 +872,10 @@ function navigateLastAssistantMessage(direction) { return false; } + +//------------------------------------------------ +// Tooltips +//------------------------------------------------ + +// File upload button +document.querySelector("#chat-input .upload-button").title = "Upload text files, PDFs, and DOCX documents"; From f8d220c1e6c0263e76797b0e34dc9ce20335875b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 31 May 2025 21:22:36 -0700 Subject: [PATCH 35/59] Add a tooltip to the web search checkbox --- js/main.js | 3 +++ modules/ui_chat.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index 0fdd7ffd..b9cb3cdd 100644 --- a/js/main.js +++ b/js/main.js @@ -879,3 +879,6 @@ function navigateLastAssistantMessage(direction) { // File upload button document.querySelector("#chat-input .upload-button").title = "Upload text files, PDFs, and DOCX documents"; + +// Activate web search +document.getElementById("web-search").title = "Search the internet with DuckDuckGo"; diff --git a/modules/ui_chat.py b/modules/ui_chat.py index d79aa523..73528a92 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -87,7 +87,7 @@ def create_ui(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) with gr.Row(): - shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search') + shared.gradio['enable_web_search'] = 
gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search') with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) From 85f2f01a3a78dc85bce9eeded71d9ff9f5bd4ab3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 31 May 2025 21:29:25 -0700 Subject: [PATCH 36/59] UI: Fix extra gaps on the right sidebar --- css/main.css | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/css/main.css b/css/main.css index 967d94ed..bdaacd4f 100644 --- a/css/main.css +++ b/css/main.css @@ -1555,3 +1555,8 @@ strong { button:focus { outline: none; } + +/* Fix extra gaps for hidden elements on the right sidebar */ +.svelte-sa48pu.stretch:has(> .hidden:only-child) { + display: none; +} From 98a7508a99f2c3bcb2139f7ef975b692f004c695 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 31 May 2025 22:18:17 -0700 Subject: [PATCH 37/59] UI: Move 'Show controls' inside the hover menu --- css/main.css | 52 +++++++++++++++++++++++----------------------- js/main.js | 40 ++++++++++++++--------------------- modules/ui_chat.py | 25 ++++++++-------------- 3 files changed, 51 insertions(+), 66 deletions(-) diff --git a/css/main.css b/css/main.css index bdaacd4f..adc59fba 100644 --- a/css/main.css +++ b/css/main.css @@ -582,7 +582,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #chat-input { padding: 0; - padding-top: 18px; background: transparent; border: none; } @@ -661,31 +660,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } } -#show-controls { - position: absolute; - background-color: transparent; - border: 0 !important; - border-radius: 0; -} - -#show-controls label { - z-index: 1000; - position: absolute; - right: 30px; - top: 10px; - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; -} - -.dark #show-controls span { - color: var(--neutral-400); -} - -#show-controls span { - color: var(--neutral-600); -} - #typing-container { display: none; position: absolute; @@ -785,6 +759,32 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { background: var(--selected-item-color-dark) !important; } +#show-controls { + height: 36px; + border-top: 1px solid var(--border-color-dark) !important; + border-left: 1px solid var(--border-color-dark) !important; + border-right: 1px solid var(--border-color-dark) !important; + border-radius: 0; + border-bottom: 0 !important; + background-color: var(--darker-gray); + padding-top: 3px; + padding-left: 4px; + display: flex; +} + +#show-controls label { + display: flex; + flex-direction: row-reverse; + font-weight: bold; + justify-content: space-between; + width: 100%; + padding-right: 12px; +} + +#show-controls label input { + margin-top: 4px; +} + .transparent-substring { opacity: 0.333; } diff --git a/js/main.js b/js/main.js index b9cb3cdd..3652daa0 100644 --- a/js/main.js +++ b/js/main.js @@ -277,7 +277,7 @@ for (i = 0; i < slimDropdownElements.length; i++) { // The show/hide events were adapted from: // https://github.com/SillyTavern/SillyTavern/blob/6c8bd06308c69d51e2eb174541792a870a83d2d6/public/script.js //------------------------------------------------ -var buttonsInChat = document.querySelectorAll("#chat-tab #chat-buttons button"); +var buttonsInChat = document.querySelectorAll("#chat-tab 
#chat-buttons button, #chat-tab #chat-buttons #show-controls"); var button = document.getElementById("hover-element-button"); var menu = document.getElementById("hover-menu"); var istouchscreen = (navigator.maxTouchPoints > 0) || "ontouchstart" in document.documentElement; @@ -298,18 +298,21 @@ if (buttonsInChat.length > 0) { const thisButton = buttonsInChat[i]; menu.appendChild(thisButton); - thisButton.addEventListener("click", () => { - hideMenu(); - }); + // Only apply transformations to button elements + if (thisButton.tagName.toLowerCase() === 'button') { + thisButton.addEventListener("click", () => { + hideMenu(); + }); + + const buttonText = thisButton.textContent; + const matches = buttonText.match(/(\(.*?\))/); - const buttonText = thisButton.textContent; - const matches = buttonText.match(/(\(.*?\))/); - - if (matches && matches.length > 1) { - // Apply the transparent-substring class to the matched substring - const substring = matches[1]; - const newText = buttonText.replace(substring, ` ${substring.slice(1, -1)}`); - thisButton.innerHTML = newText; + if (matches && matches.length > 1) { + // Apply the transparent-substring class to the matched substring + const substring = matches[1]; + const newText = buttonText.replace(substring, ` ${substring.slice(1, -1)}`); + thisButton.innerHTML = newText; + } } } } @@ -382,21 +385,10 @@ document.addEventListener("click", function (event) { } }); -//------------------------------------------------ -// Relocate the "Show controls" checkbox -//------------------------------------------------ -var elementToMove = document.getElementById("show-controls"); -var parent = elementToMove.parentNode; -for (var i = 0; i < 2; i++) { - parent = parent.parentNode; -} - -parent.insertBefore(elementToMove, parent.firstChild); - //------------------------------------------------ // Position the chat input //------------------------------------------------ -document.getElementById("show-controls").parentNode.classList.add("chat-input-positioned"); +document.getElementById("chat-input-row").classList.add("chat-input-positioned"); //------------------------------------------------ // Focus on the chat input diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 73528a92..822b77b8 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -55,7 +55,6 @@ def create_ui(): with gr.Column(scale=10, elem_id='chat-input-container'): shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar']) - shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls') shared.gradio['typing-dots'] = gr.HTML(value='
', label='typing', elem_id='typing-container') with gr.Column(scale=1, elem_id='generate-stop-container'): @@ -65,21 +64,15 @@ def create_ui(): # Hover menu buttons with gr.Column(elem_id='chat-buttons'): - with gr.Row(): - shared.gradio['Regenerate'] = gr.Button('Regenerate (Ctrl + Enter)', elem_id='Regenerate') - shared.gradio['Continue'] = gr.Button('Continue (Alt + Enter)', elem_id='Continue') - shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last') - - with gr.Row(): - shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate') - - with gr.Row(): - shared.gradio['Send dummy message'] = gr.Button('Send dummy message') - shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply') - - with gr.Row(): - shared.gradio['send-chat-to-default'] = gr.Button('Send to Default') - shared.gradio['send-chat-to-notebook'] = gr.Button('Send to Notebook') + shared.gradio['Regenerate'] = gr.Button('Regenerate (Ctrl + Enter)', elem_id='Regenerate') + shared.gradio['Continue'] = gr.Button('Continue (Alt + Enter)', elem_id='Continue') + shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last') + shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate') + shared.gradio['Send dummy message'] = gr.Button('Send dummy message') + shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply') + shared.gradio['send-chat-to-default'] = gr.Button('Send to Default') + shared.gradio['send-chat-to-notebook'] = gr.Button('Send to Notebook') + shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls') with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']): with gr.Column(): From 0816ecedb75add2dd1a61c9bd9a477e5d847c88a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 31 May 2025 22:24:39 -0700 Subject: [PATCH 38/59] Lint --- js/main.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/js/main.js b/js/main.js index 3652daa0..d152a572 100644 --- a/js/main.js +++ b/js/main.js @@ -299,11 +299,11 @@ if (buttonsInChat.length > 0) { menu.appendChild(thisButton); // Only apply transformations to button elements - if (thisButton.tagName.toLowerCase() === 'button') { + if (thisButton.tagName.toLowerCase() === "button") { thisButton.addEventListener("click", () => { hideMenu(); }); - + const buttonText = thisButton.textContent; const matches = buttonText.match(/(\(.*?\))/); From 9e801930087170bb24628e680ad4cbd4f6a5b098 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 31 May 2025 22:39:07 -0700 Subject: [PATCH 39/59] Add the model name to each message's metadata --- modules/chat.py | 2 +- modules/html_generator.py | 47 ++++++++++++++++++++++++++------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index ba61c7a9..1222d2bb 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -710,7 +710,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess # Add timestamp for assistant's response at the start of generation row_idx = len(output['internal']) - 1 - update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp()) + update_message_metadata(output['metadata'], "assistant", row_idx, 
timestamp=get_current_timestamp(), model_name=shared.model_name) # Generate reply = None diff --git a/modules/html_generator.py b/modules/html_generator.py index cbf3e19c..03b5d485 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -350,12 +350,14 @@ remove_button = f'' -def format_message_timestamp(history, role, index): +def format_message_timestamp(history, role, index, tooltip_include_timestamp=True): """Get a formatted timestamp HTML span for a message if available""" key = f"{role}_{index}" if 'metadata' in history and key in history['metadata'] and history['metadata'][key].get('timestamp'): timestamp = history['metadata'][key]['timestamp'] - return f"{timestamp}" + tooltip_text = get_message_tooltip(history, role, index, include_timestamp=tooltip_include_timestamp) + title_attr = f' title="{html.escape(tooltip_text)}"' if tooltip_text else '' + return f"{timestamp}" return "" @@ -388,6 +390,23 @@ def format_message_attachments(history, role, index): return "" +def get_message_tooltip(history, role, index, include_timestamp=True): + """Get tooltip text combining timestamp and model name for a message""" + key = f"{role}_{index}" + if 'metadata' not in history or key not in history['metadata']: + return "" + + meta = history['metadata'][key] + tooltip_parts = [] + + if include_timestamp and meta.get('timestamp'): + tooltip_parts.append(meta['timestamp']) + if meta.get('model_name'): + tooltip_parts.append(f"Model: {meta['model_name']}") + + return " | ".join(tooltip_parts) + + def get_version_navigation_html(history, i, role): """Generate simple navigation arrows for message versions""" key = f"{role}_{i}" @@ -462,15 +481,13 @@ def generate_instruct_html(history): # Create info buttons for timestamps if they exist info_message_user = "" if user_timestamp != "": - # Extract the timestamp value from the span - user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] - info_message_user = info_button.replace("message", user_timestamp_value) + tooltip_text = get_message_tooltip(history, "user", i) + info_message_user = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"') info_message_assistant = "" if assistant_timestamp != "": - # Extract the timestamp value from the span - assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] - info_message_assistant = info_button.replace("message", assistant_timestamp_value) + tooltip_text = get_message_tooltip(history, "assistant", i) + info_message_assistant = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"') if converted_visible[0]: # Don't display empty user messages output += ( @@ -521,8 +538,8 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] # Get timestamps - user_timestamp = format_message_timestamp(history, "user", i) - assistant_timestamp = format_message_timestamp(history, "assistant", i) + user_timestamp = format_message_timestamp(history, "user", i, tooltip_include_timestamp=False) + assistant_timestamp = format_message_timestamp(history, "assistant", i, tooltip_include_timestamp=False) # Get attachments user_attachments = format_message_attachments(history, "user", i) @@ -580,15 +597,13 @@ def generate_chat_html(history, name1, name2, reset_cache=False): # Create info buttons for timestamps if they exist info_message_user = "" if user_timestamp != 
"": - # Extract the timestamp value from the span - user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] - info_message_user = info_button.replace("message", user_timestamp_value) + tooltip_text = get_message_tooltip(history, "user", i) + info_message_user = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"') info_message_assistant = "" if assistant_timestamp != "": - # Extract the timestamp value from the span - assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] - info_message_assistant = info_button.replace("message", assistant_timestamp_value) + tooltip_text = get_message_tooltip(history, "assistant", i) + info_message_assistant = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"') if converted_visible[0]: # Don't display empty user messages output += ( From 88ff3e6ad8ddf96aabf6d7ceb4c228ed6fb08980 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 1 Jun 2025 08:00:37 -0700 Subject: [PATCH 40/59] CSS fixes after 98a7508a99f2c3bcb2139f7ef975b692f004c695 --- css/main.css | 2 +- js/main.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/css/main.css b/css/main.css index adc59fba..0c6dc16e 100644 --- a/css/main.css +++ b/css/main.css @@ -665,7 +665,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: absolute; background-color: transparent; left: -2px; - top: 4px; + top: -14px; padding: var(--block-padding); } diff --git a/js/main.js b/js/main.js index d152a572..05c19571 100644 --- a/js/main.js +++ b/js/main.js @@ -184,7 +184,7 @@ const observer = new MutationObserver(function(mutations) { const prevSibling = lastChild?.previousElementSibling; if (lastChild && prevSibling) { lastChild.style.setProperty("margin-bottom", - `max(0px, calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px))`, + `max(0px, calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 84px) - ${lastChild.offsetHeight}px))`, "important" ); } From 3e3746283cd60409f83b6cf5549ba08d12612bde Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 1 Jun 2025 10:55:31 -0700 Subject: [PATCH 41/59] Improve the typing dots position --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 0c6dc16e..296476cd 100644 --- a/css/main.css +++ b/css/main.css @@ -665,7 +665,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: absolute; background-color: transparent; left: -2px; - top: -14px; + top: -5px; padding: var(--block-padding); } From 83849336d8efcae0340b768a39c83106ee406264 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 1 Jun 2025 10:58:28 -0700 Subject: [PATCH 42/59] Improve how Show controls looks in the hover menu --- css/main.css | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 296476cd..71d67ff4 100644 --- a/css/main.css +++ b/css/main.css @@ -776,9 +776,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: flex; flex-direction: row-reverse; font-weight: bold; - justify-content: space-between; + justify-content: start; width: 100%; padding-right: 12px; + gap: 10px; } #show-controls label input { From bf42b2c3a1175266dcc7c481f589d53805d956f3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 1 Jun 2025 11:02:04 -0700 Subject: [PATCH 43/59] 
Fix thinking blocks sometimes showing a white outline --- css/main.css | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/css/main.css b/css/main.css index 71d67ff4..a9cb36ab 100644 --- a/css/main.css +++ b/css/main.css @@ -1327,6 +1327,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { overflow: hidden; } +.thinking-content:focus, .thinking-header:focus { + outline: 0 !important; +} + .dark .thinking-block { background-color: var(--darker-gray); } From 7a81beb0c16ff51a90fbe77e6300076714af1fd0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 1 Jun 2025 18:23:23 -0700 Subject: [PATCH 44/59] Turn long pasted text into an attachment automatically --- js/main.js | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/js/main.js b/js/main.js index 05c19571..8090937f 100644 --- a/js/main.js +++ b/js/main.js @@ -865,6 +865,46 @@ function navigateLastAssistantMessage(direction) { return false; } +//------------------------------------------------ +// Paste Handler for Long Text +//------------------------------------------------ + +const MAX_PLAIN_TEXT_LENGTH = 2500; + +function setupPasteHandler() { + const textbox = document.querySelector("#chat-input textarea[data-testid=\"textbox\"]"); + const fileInput = document.querySelector("#chat-input input[data-testid=\"file-upload\"]"); + + if (!textbox || !fileInput) { + setTimeout(setupPasteHandler, 500); + return; + } + + textbox.addEventListener("paste", async (event) => { + const text = event.clipboardData?.getData("text"); + + if (text && text.length > MAX_PLAIN_TEXT_LENGTH) { + event.preventDefault(); + + const file = new File([text], "pasted_text.txt", { + type: "text/plain", + lastModified: Date.now() + }); + + const dataTransfer = new DataTransfer(); + dataTransfer.items.add(file); + fileInput.files = dataTransfer.files; + fileInput.dispatchEvent(new Event("change", { bubbles: true })); + } + }); +} + +if (document.readyState === "loading") { + document.addEventListener("DOMContentLoaded", setupPasteHandler); +} else { + setupPasteHandler(); +} + //------------------------------------------------ // Tooltips //------------------------------------------------ From 92adceb7b57464ef03886cba5324a32e7d8f8b67 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 1 Jun 2025 19:22:21 -0700 Subject: [PATCH 45/59] UI: Fix the model downloader progress bar --- download-model.py | 52 ++++++++++++------ modules/ui_model_menu.py | 115 +++++++++++++++++++++++++++------------ 2 files changed, 115 insertions(+), 52 deletions(-) diff --git a/download-model.py b/download-model.py index 25517491..576a8b79 100644 --- a/download-model.py +++ b/download-model.py @@ -32,6 +32,7 @@ class ModelDownloader: self.max_retries = max_retries self.session = self.get_session() self._progress_bar_slots = None + self.progress_queue = None def get_session(self): session = requests.Session() @@ -218,33 +219,45 @@ class ModelDownloader: max_retries = self.max_retries attempt = 0 + file_downloaded_count_for_progress = 0 + try: while attempt < max_retries: attempt += 1 session = self.session headers = {} mode = 'wb' + current_file_size_on_disk = 0 try: if output_path.exists() and not start_from_scratch: - # Resume download - r = session.get(url, stream=True, timeout=20) - total_size = int(r.headers.get('content-length', 0)) - if output_path.stat().st_size >= total_size: + current_file_size_on_disk = output_path.stat().st_size + r_head = 
session.head(url, timeout=20) + r_head.raise_for_status() + total_size = int(r_head.headers.get('content-length', 0)) + + if current_file_size_on_disk >= total_size and total_size > 0: + if self.progress_queue is not None and total_size > 0: + self.progress_queue.put((1.0, str(filename))) return - headers = {'Range': f'bytes={output_path.stat().st_size}-'} + headers = {'Range': f'bytes={current_file_size_on_disk}-'} mode = 'ab' with session.get(url, stream=True, headers=headers, timeout=30) as r: - r.raise_for_status() # If status is not 2xx, raise an error - total_size = int(r.headers.get('content-length', 0)) - block_size = 1024 * 1024 # 1MB + r.raise_for_status() + total_size_from_stream = int(r.headers.get('content-length', 0)) + if mode == 'ab': + effective_total_size = current_file_size_on_disk + total_size_from_stream + else: + effective_total_size = total_size_from_stream - filename_str = str(filename) # Convert PosixPath to string if necessary + block_size = 1024 * 1024 + filename_str = str(filename) tqdm_kwargs = { - 'total': total_size, + 'total': effective_total_size, + 'initial': current_file_size_on_disk if mode == 'ab' else 0, 'unit': 'B', 'unit_scale': True, 'unit_divisor': 1024, @@ -261,16 +274,20 @@ class ModelDownloader: }) with open(output_path, mode) as f: + if mode == 'ab': + f.seek(current_file_size_on_disk) + with tqdm.tqdm(**tqdm_kwargs) as t: - count = 0 + file_downloaded_count_for_progress = current_file_size_on_disk for data in r.iter_content(block_size): f.write(data) t.update(len(data)) - if total_size != 0 and self.progress_bar is not None: - count += len(data) - self.progress_bar(float(count) / float(total_size), f"{filename_str}") + if effective_total_size != 0 and self.progress_queue is not None: + file_downloaded_count_for_progress += len(data) + progress_fraction = float(file_downloaded_count_for_progress) / float(effective_total_size) + self.progress_queue.put((progress_fraction, filename_str)) + break - break # Exit loop if successful except (RequestException, ConnectionError, Timeout) as e: print(f"Error downloading {filename}: {e}.") print(f"That was attempt {attempt}/{max_retries}.", end=' ') @@ -295,10 +312,9 @@ class ModelDownloader: finally: print(f"\nDownload of {len(file_list)} files to {output_folder} completed.") - def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar=None, start_from_scratch=False, threads=4, specific_file=None, is_llamacpp=False): - self.progress_bar = progress_bar + def download_model_files(self, model, branch, links, sha256, output_folder, progress_queue=None, start_from_scratch=False, threads=4, specific_file=None, is_llamacpp=False): + self.progress_queue = progress_queue - # Create the folder and writing the metadata output_folder.mkdir(parents=True, exist_ok=True) if not is_llamacpp: diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 862b3893..2a7d3d9d 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -1,4 +1,6 @@ import importlib +import queue +import threading import traceback from functools import partial from pathlib import Path @@ -205,48 +207,51 @@ def load_lora_wrapper(selected_loras): def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False): + downloader_module = importlib.import_module("download-model") + downloader = downloader_module.ModelDownloader() + update_queue = queue.Queue() + try: # Handle direct GGUF URLs if repo_id.startswith("https://") and ("huggingface.co" in repo_id) 
and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")): try: path = repo_id.split("huggingface.co/")[1] - - # Extract the repository ID (first two parts of the path) parts = path.split("/") if len(parts) >= 2: extracted_repo_id = f"{parts[0]}/{parts[1]}" - - # Extract the filename (last part of the path) - filename = repo_id.split("/")[-1] - if "?download=true" in filename: - filename = filename.replace("?download=true", "") - + filename = repo_id.split("/")[-1].replace("?download=true", "") repo_id = extracted_repo_id specific_file = filename - except: - pass + except Exception as e: + yield f"Error parsing GGUF URL: {e}" + progress(0.0) + return - if repo_id == "": - yield ("Please enter a model path") + if not repo_id: + yield "Please enter a model path." + progress(0.0) return repo_id = repo_id.strip() specific_file = specific_file.strip() - downloader = importlib.import_module("download-model").ModelDownloader() - progress(0.0) + progress(0.0, "Preparing download...") + model, branch = downloader.sanitize_model_and_branch_names(repo_id, None) - - yield ("Getting the download links from Hugging Face") + yield "Getting download links from Hugging Face..." links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file) + if not links: + yield "No files found to download for the given model/criteria." + progress(0.0) + return + # Check for multiple GGUF files gguf_files = [link for link in links if link.lower().endswith('.gguf')] if len(gguf_files) > 1 and not specific_file: output = "Multiple GGUF files found. Please copy one of the following filenames to the 'File name' field:\n\n```\n" for link in gguf_files: output += f"{Path(link).name}\n" - output += "```" yield output return @@ -255,17 +260,13 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur output = "```\n" for link in links: output += f"{Path(link).name}" + "\n" - output += "```" yield output return - yield ("Getting the output folder") + yield "Determining output folder..." output_folder = downloader.get_output_folder( - model, - branch, - is_lora, - is_llamacpp=is_llamacpp, + model, branch, is_lora, is_llamacpp=is_llamacpp, model_dir=shared.args.model_dir if shared.args.model_dir != shared.args_defaults.model_dir else None ) @@ -275,19 +276,65 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur output_folder = Path(shared.args.lora_dir) if check: - progress(0.5) - - yield ("Checking previously downloaded files") + yield "Checking previously downloaded files..." + progress(0.5, "Verifying files...") downloader.check_model_files(model, branch, links, sha256, output_folder) - progress(1.0) - else: - yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}/`") - downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=is_llamacpp) + progress(1.0, "Verification complete.") + yield "File check complete." 
+ return - yield (f"Model successfully saved to `{output_folder}/`.") - except: - progress(1.0) - yield traceback.format_exc().replace('\n', '\n\n') + yield "" + progress(0.0, "Download starting...") + + def downloader_thread_target(): + try: + downloader.download_model_files( + model, branch, links, sha256, output_folder, + progress_queue=update_queue, + threads=4, + is_llamacpp=is_llamacpp, + specific_file=specific_file + ) + update_queue.put(("COMPLETED", f"Model successfully saved to `{output_folder}/`.")) + except Exception as e: + tb_str = traceback.format_exc().replace('\n', '\n\n') + update_queue.put(("ERROR", tb_str)) + + download_thread = threading.Thread(target=downloader_thread_target) + download_thread.start() + + while True: + try: + message = update_queue.get(timeout=0.2) + if not isinstance(message, tuple) or len(message) != 2: + continue + + msg_identifier, data = message + + if msg_identifier == "COMPLETED": + progress(1.0, "Download complete!") + yield data + break + elif msg_identifier == "ERROR": + progress(0.0, "Error occurred") + yield data + break + elif isinstance(msg_identifier, float): + progress_value = msg_identifier + description_str = data + progress(progress_value, f"Downloading: {description_str}") + + except queue.Empty: + if not download_thread.is_alive(): + yield "Download process finished." + break + + download_thread.join() + + except Exception as e: + progress(0.0) + tb_str = traceback.format_exc().replace('\n', '\n\n') + yield tb_str def update_truncation_length(current_length, state): From ad6d0218ae0c015694bef7a43f5f628d281a1c36 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 1 Jun 2025 19:27:14 -0700 Subject: [PATCH 46/59] Fix after 219f0a773166deeb0326c2874b29e66e382df524 --- modules/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models.py b/modules/models.py index d329ae3c..c1e7fb56 100644 --- a/modules/models.py +++ b/modules/models.py @@ -116,7 +116,7 @@ def unload_model(keep_model_name=False): return is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer') - if shared.args.loader == 'ExLlamav3_HF': + if shared.model.__class__.__name__ == 'Exllamav3HF': shared.model.unload() shared.model = shared.tokenizer = None From 2db7745cbde543d7e1abd81c0389c544c84621db Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 1 Jun 2025 22:12:24 -0700 Subject: [PATCH 47/59] Show llama.cpp prompt processing on one line instead of many lines --- modules/llama_cpp_server.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index d695c74e..aa712541 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -409,14 +409,31 @@ class LlamaServer: def filter_stderr_with_progress(process_stderr): progress_pattern = re.compile(r'slot update_slots: id.*progress = (\d+\.\d+)') + last_was_progress = False + try: for line in iter(process_stderr.readline, ''): + line = line.rstrip('\n\r') # Remove existing newlines progress_match = progress_pattern.search(line) + if progress_match: - sys.stderr.write(line) + if last_was_progress: + # Overwrite the previous progress line using carriage return + sys.stderr.write(f'\r{line}') + else: + # First progress line - print normally + sys.stderr.write(line) sys.stderr.flush() + last_was_progress = True elif not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET 
/health' not in line: - sys.stderr.write(line) + if last_was_progress: + # Finish the progress line with a newline, then print the new line + sys.stderr.write(f'\n{line}\n') + else: + # Normal line - print with newline + sys.stderr.write(f'{line}\n') sys.stderr.flush() + last_was_progress = False + # For filtered lines, don't change last_was_progress state except (ValueError, IOError): pass From 45c9ae312c1ff60ce13c721d1290b65f01bf9660 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 1 Jun 2025 22:17:22 -0700 Subject: [PATCH 48/59] Use the flash-attention wheels in https://github.com/kingbri1/flash-attention --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index e61677a6..04d97220 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -41,5 +41,5 @@ https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+ https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index e0cb84b4..7c3635cc 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -41,5 +41,5 @@ https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+ https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From bb409c926e986e57b8c3eea3582abb466f32ad08 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Mon, 2 Jun 2025 09:50:17 -0300 Subject: [PATCH 49/59] Update only the last message during streaming + add back dynamic UI update speed (#7038) --- js/global_scope_js.js | 25 ++- modules/chat.py | 4 +- modules/html_generator.py | 274 ++++++++++++++++--------------- modules/shared.py | 3 +- modules/text_generation.py | 18 +- modules/ui.py | 6 +- modules/ui_chat.py | 4 +- modules/ui_parameters.py | 2 - user_data/settings-template.yaml | 1 - 9 files changed, 181 insertions(+), 156 deletions(-) diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 3274f47e..d5140c93 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -229,10 +229,23 @@ function removeLastClick() { document.getElementById("Remove-last").click(); } -function handleMorphdomUpdate(text) { +function handleMorphdomUpdate(data) { + // Determine target element and use it as query scope + var target_element, target_html; + if (data.last_message_only) { + const childNodes = document.getElementsByClassName("messages")[0].childNodes; + target_element = childNodes[childNodes.length - 1]; + target_html = data.html; + } else { + target_element = document.getElementById("chat").parentNode; + target_html = "
" + data.html + "
"; + } + + const queryScope = target_element; + // Track open blocks const openBlocks = new Set(); - document.querySelectorAll(".thinking-block").forEach(block => { + queryScope.querySelectorAll(".thinking-block").forEach(block => { const blockId = block.getAttribute("data-block-id"); // If block exists and is open, add to open set if (blockId && block.hasAttribute("open")) { @@ -242,7 +255,7 @@ function handleMorphdomUpdate(text) { // Store scroll positions for any open blocks const scrollPositions = {}; - document.querySelectorAll(".thinking-block[open]").forEach(block => { + queryScope.querySelectorAll(".thinking-block[open]").forEach(block => { const content = block.querySelector(".thinking-content"); const blockId = block.getAttribute("data-block-id"); if (content && blockId) { @@ -255,8 +268,8 @@ function handleMorphdomUpdate(text) { }); morphdom( - document.getElementById("chat").parentNode, - "
" + text + "
", + target_element, + target_html, { onBeforeElUpdated: function(fromEl, toEl) { // Preserve code highlighting @@ -307,7 +320,7 @@ function handleMorphdomUpdate(text) { ); // Add toggle listeners for new blocks - document.querySelectorAll(".thinking-block").forEach(block => { + queryScope.querySelectorAll(".thinking-block").forEach(block => { if (!block._hasToggleListener) { block.addEventListener("toggle", function(e) { if (this.open) { diff --git a/modules/chat.py b/modules/chat.py index 1222d2bb..f1ea16f1 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -825,7 +825,9 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): last_save_time = time.monotonic() save_interval = 8 for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)): - yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history + yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'], last_message_only=(i > 0)), history + if i == 0: + time.sleep(0.125) # We need this to make sure the first update goes through current_time = time.monotonic() # Save on first iteration or if save_interval seconds have passed diff --git a/modules/html_generator.py b/modules/html_generator.py index 03b5d485..f90e3b04 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -462,64 +462,69 @@ def actions_html(history, i, role, info_message=""): f'{version_nav_html}') -def generate_instruct_html(history): - output = f'
' +def generate_instruct_html(history, last_message_only=False): + if not last_message_only: + output = f'
' + else: + output = "" - for i in range(len(history['visible'])): - row_visible = history['visible'][i] - row_internal = history['internal'][i] - converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + def create_message(role, content, raw_content): + """Inner function that captures variables from outer scope.""" + class_name = "user-message" if role == "user" else "assistant-message" - # Get timestamps - user_timestamp = format_message_timestamp(history, "user", i) - assistant_timestamp = format_message_timestamp(history, "assistant", i) + # Get role-specific data + timestamp = format_message_timestamp(history, role, i) + attachments = format_message_attachments(history, role, i) - # Get attachments - user_attachments = format_message_attachments(history, "user", i) - assistant_attachments = format_message_attachments(history, "assistant", i) + # Create info button if timestamp exists + info_message = "" + if timestamp: + tooltip_text = get_message_tooltip(history, role, i) + info_message = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"') - # Create info buttons for timestamps if they exist - info_message_user = "" - if user_timestamp != "": - tooltip_text = get_message_tooltip(history, "user", i) - info_message_user = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"') - - info_message_assistant = "" - if assistant_timestamp != "": - tooltip_text = get_message_tooltip(history, "assistant", i) - info_message_assistant = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"') - - if converted_visible[0]: # Don't display empty user messages - output += ( - f'
' - f'
' - f'
{converted_visible[0]}
' - f'{user_attachments}' - f'{actions_html(history, i, "user", info_message_user)}' - f'
' - f'
' - ) - - output += ( - f'
' f'
' - f'
{converted_visible[1]}
' - f'{assistant_attachments}' - f'{actions_html(history, i, "assistant", info_message_assistant)}' + f'
{content}
' + f'{attachments}' + f'{actions_html(history, i, role, info_message)}' f'
' f'
' ) - output += "
" + # Determine range + start_idx = len(history['visible']) - 1 if last_message_only else 0 + end_idx = len(history['visible']) + + for i in range(start_idx, end_idx): + row_visible = history['visible'][i] + row_internal = history['internal'][i] + + # Convert content + if last_message_only: + converted_visible = [None, convert_to_markdown_wrapped(row_visible[1], message_id=i, use_cache=i != len(history['visible']) - 1)] + else: + converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + + # Generate messages + if not last_message_only and converted_visible[0]: + output += create_message("user", converted_visible[0], row_internal[0]) + + output += create_message("assistant", converted_visible[1], row_internal[1]) + + if not last_message_only: + output += "
" + return output -def generate_cai_chat_html(history, name1, name2, style, character, reset_cache=False): - output = f'
' +def generate_cai_chat_html(history, name1, name2, style, character, reset_cache=False, last_message_only=False): + if not last_message_only: + output = f'
' + else: + output = "" # We use ?character and ?time.time() to force the browser to reset caches img_bot = ( @@ -527,110 +532,117 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= if Path("user_data/cache/pfp_character_thumb.png").exists() else '' ) - img_me = ( - f'' - if Path("user_data/cache/pfp_me.png").exists() else '' - ) + def create_message(role, content, raw_content): + """Inner function for CAI-style messages.""" + circle_class = "circle-you" if role == "user" else "circle-bot" + name = name1 if role == "user" else name2 - for i in range(len(history['visible'])): - row_visible = history['visible'][i] - row_internal = history['internal'][i] - converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get role-specific data + timestamp = format_message_timestamp(history, role, i, tooltip_include_timestamp=False) + attachments = format_message_attachments(history, role, i) - # Get timestamps - user_timestamp = format_message_timestamp(history, "user", i, tooltip_include_timestamp=False) - assistant_timestamp = format_message_timestamp(history, "assistant", i, tooltip_include_timestamp=False) + # Get appropriate image + if role == "user": + img = (f'' + if Path("user_data/cache/pfp_me.png").exists() else '') + else: + img = img_bot - # Get attachments - user_attachments = format_message_attachments(history, "user", i) - assistant_attachments = format_message_attachments(history, "assistant", i) - - if converted_visible[0]: # Don't display empty user messages - output += ( - f'
' - f'
{img_me}
' - f'
' - f'
{name1}{user_timestamp}
' - f'
{converted_visible[0]}
' - f'{user_attachments}' - f'{actions_html(history, i, "user")}' - f'
' - f'
' - ) - - output += ( + return ( f'
' - f'
{img_bot}
' + f'
{img}
' f'
' - f'
{name2}{assistant_timestamp}
' - f'
{converted_visible[1]}
' - f'{assistant_attachments}' - f'{actions_html(history, i, "assistant")}' + f'
{name}{timestamp}
' + f'
{content}
' + f'{attachments}' + f'{actions_html(history, i, role)}' f'
' f'
' ) - output += "
" + # Determine range + start_idx = len(history['visible']) - 1 if last_message_only else 0 + end_idx = len(history['visible']) + + for i in range(start_idx, end_idx): + row_visible = history['visible'][i] + row_internal = history['internal'][i] + + # Convert content + if last_message_only: + converted_visible = [None, convert_to_markdown_wrapped(row_visible[1], message_id=i, use_cache=i != len(history['visible']) - 1)] + else: + converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + + # Generate messages + if not last_message_only and converted_visible[0]: + output += create_message("user", converted_visible[0], row_internal[0]) + + output += create_message("assistant", converted_visible[1], row_internal[1]) + + if not last_message_only: + output += "
" + return output -def generate_chat_html(history, name1, name2, reset_cache=False): - output = f'
' +def generate_chat_html(history, name1, name2, reset_cache=False, last_message_only=False): + if not last_message_only: + output = f'
' + else: + output = "" - for i in range(len(history['visible'])): - row_visible = history['visible'][i] - row_internal = history['internal'][i] - converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + def create_message(role, content, raw_content): + """Inner function for WPP-style messages.""" + text_class = "text-you" if role == "user" else "text-bot" - # Get timestamps - user_timestamp = format_message_timestamp(history, "user", i) - assistant_timestamp = format_message_timestamp(history, "assistant", i) + # Get role-specific data + timestamp = format_message_timestamp(history, role, i) + attachments = format_message_attachments(history, role, i) - # Get attachments - user_attachments = format_message_attachments(history, "user", i) - assistant_attachments = format_message_attachments(history, "assistant", i) + # Create info button if timestamp exists + info_message = "" + if timestamp: + tooltip_text = get_message_tooltip(history, role, i) + info_message = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"') - # Create info buttons for timestamps if they exist - info_message_user = "" - if user_timestamp != "": - tooltip_text = get_message_tooltip(history, "user", i) - info_message_user = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"') - - info_message_assistant = "" - if assistant_timestamp != "": - tooltip_text = get_message_tooltip(history, "assistant", i) - info_message_assistant = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"') - - if converted_visible[0]: # Don't display empty user messages - output += ( - f'
' - f'
' - f'
{converted_visible[0]}
' - f'{user_attachments}' - f'{actions_html(history, i, "user", info_message_user)}' - f'
' - f'
' - ) - - output += ( + return ( f'
' - f'
' - f'
{converted_visible[1]}
' - f'{assistant_attachments}' - f'{actions_html(history, i, "assistant", info_message_assistant)}' + f'
' + f'
{content}
' + f'{attachments}' + f'{actions_html(history, i, role, info_message)}' f'
' f'
' ) - output += "
" + # Determine range + start_idx = len(history['visible']) - 1 if last_message_only else 0 + end_idx = len(history['visible']) + + for i in range(start_idx, end_idx): + row_visible = history['visible'][i] + row_internal = history['internal'][i] + + # Convert content + if last_message_only: + converted_visible = [None, convert_to_markdown_wrapped(row_visible[1], message_id=i, use_cache=i != len(history['visible']) - 1)] + else: + converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + + # Generate messages + if not last_message_only and converted_visible[0]: + output += create_message("user", converted_visible[0], row_internal[0]) + + output += create_message("assistant", converted_visible[1], row_internal[1]) + + if not last_message_only: + output += "
" + return output @@ -644,15 +656,15 @@ def time_greeting(): return "Good evening!" -def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False): +def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False, last_message_only=False): if len(history['visible']) == 0: greeting = f"
{time_greeting()} How can I help you today?
" result = f'
{greeting}
' elif mode == 'instruct': - result = generate_instruct_html(history) + result = generate_instruct_html(history, last_message_only=last_message_only) elif style == 'wpp': - result = generate_chat_html(history, name1, name2) + result = generate_chat_html(history, name1, name2, last_message_only=last_message_only) else: - result = generate_cai_chat_html(history, name1, name2, style, character, reset_cache) + result = generate_cai_chat_html(history, name1, name2, style, character, reset_cache=reset_cache, last_message_only=last_message_only) - return {'html': result} + return {'html': result, 'last_message_only': last_message_only} diff --git a/modules/shared.py b/modules/shared.py index d2305f30..f712f7f8 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -21,7 +21,7 @@ lora_names = [] # Generation variables stop_everything = False generation_lock = None -processing_message = '*Is typing...*' +processing_message = '' # UI variables gradio = {} @@ -47,7 +47,6 @@ settings = { 'max_new_tokens_max': 4096, 'prompt_lookup_num_tokens': 0, 'max_tokens_second': 0, - 'max_updates_second': 12, 'auto_max_new_tokens': True, 'ban_eos_token': False, 'add_bos_token': True, diff --git a/modules/text_generation.py b/modules/text_generation.py index 1fd6d810..0d499d50 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -65,41 +65,39 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap all_stop_strings += st shared.stop_everything = False - last_update = -1 reply = '' is_stream = state['stream'] if len(all_stop_strings) > 0 and not state['stream']: state = copy.deepcopy(state) state['stream'] = True - min_update_interval = 0 - if state.get('max_updates_second', 0) > 0: - min_update_interval = 1 / state['max_updates_second'] - # Generate + last_update = -1 + latency_threshold = 1 / 1000 for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat): + cur_time = time.monotonic() reply, stop_found = apply_stopping_strings(reply, all_stop_strings) if escape_html: reply = html.escape(reply) if is_stream: - cur_time = time.time() - # Limit number of tokens/second to make text readable in real time if state['max_tokens_second'] > 0: diff = 1 / state['max_tokens_second'] - (cur_time - last_update) if diff > 0: time.sleep(diff) - last_update = time.time() + last_update = time.monotonic() yield reply # Limit updates to avoid lag in the Gradio UI # API updates are not limited else: - if cur_time - last_update > min_update_interval: - last_update = cur_time + # If 'generate_func' takes less than 0.001 seconds to yield the next token + # (equivalent to more than 1000 tok/s), assume that the UI is lagging behind and skip yielding + if (cur_time - last_update) > latency_threshold: yield reply + last_update = time.monotonic() if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything): break diff --git a/modules/ui.py b/modules/ui.py index 9f4d67cb..14a09d2b 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -6,6 +6,7 @@ import yaml import extensions from modules import shared +from modules.chat import load_history with open(Path(__file__).resolve().parent / '../css/NotoSans/stylesheet.css', 'r') as f: css = f.read() @@ -194,7 +195,6 @@ def list_interface_input_elements(): 'max_new_tokens', 'prompt_lookup_num_tokens', 'max_tokens_second', - 'max_updates_second', 'do_sample', 'dynamic_temperature', 'temperature_last', @@ -270,6 +270,10 @@ def gather_interface_values(*args): if not shared.args.multi_user: 
shared.persistent_interface_state = output + # Prevent history loss if backend is restarted but UI is not refreshed + if output['history'] is None and output['unique_id'] is not None: + output['history'] = load_history(output['unique_id'], output['character_menu'], output['mode']) + return output diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 822b77b8..0d5a2c18 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -18,7 +18,7 @@ def create_ui(): mu = shared.args.multi_user shared.gradio['Chat input'] = gr.State() - shared.gradio['history'] = gr.JSON(visible=False) + shared.gradio['history'] = gr.State({'internal': [], 'visible': [], 'metadata': {}}) with gr.Tab('Chat', id='Chat', elem_id='chat-tab'): with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): @@ -195,7 +195,7 @@ def create_event_handlers(): shared.reload_inputs = gradio(reload_arr) # Morph HTML updates instead of updating everything - shared.gradio['display'].change(None, gradio('display'), None, js="(data) => handleMorphdomUpdate(data.html)") + shared.gradio['display'].change(None, gradio('display'), None, js="(data) => handleMorphdomUpdate(data)") shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 733d0901..84f9fbfc 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -71,8 +71,6 @@ def create_ui(default_preset): shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.') shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.') shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.') - shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.') - with gr.Column(): with gr.Row(): with gr.Column(): diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml index ce0f77e1..db481e84 100644 --- a/user_data/settings-template.yaml +++ b/user_data/settings-template.yaml @@ -18,7 +18,6 @@ max_new_tokens_min: 1 max_new_tokens_max: 4096 prompt_lookup_num_tokens: 0 max_tokens_second: 0 -max_updates_second: 12 auto_max_new_tokens: true ban_eos_token: false add_bos_token: true From 7278548cd18a9ba05062eb2db59d7f2965d8a9f6 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Mon, 2 Jun 2025 09:57:55 -0300 Subject: [PATCH 50/59] Simplify the one-click installer (#7039) --- one_click.py | 253 +++++++++++++++++++++++++-------------------------- 1 file changed, 123 insertions(+), 130 deletions(-) diff --git a/one_click.py b/one_click.py index 482a6aa9..cccb0dc9 100644 --- a/one_click.py +++ b/one_click.py @@ -70,12 +70,8 @@ def is_installed(): def cpu_has_avx2(): try: import cpuinfo - info = cpuinfo.get_cpu_info() - if 'avx2' in info['flags']: - return True - else: - return False + return 'avx2' in info['flags'] except: return True @@ -83,30 +79,112 @@ def 
cpu_has_avx2(): def cpu_has_amx(): try: import cpuinfo - info = cpuinfo.get_cpu_info() - if 'amx' in info['flags']: - return True - else: - return False + return 'amx' in info['flags'] except: return True -def torch_version(): - site_packages_path = None - for sitedir in site.getsitepackages(): - if "site-packages" in sitedir and conda_env_path in sitedir: - site_packages_path = sitedir - break +def load_state(): + """Load installer state from JSON file""" + if os.path.exists(state_file): + try: + with open(state_file, 'r') as f: + return json.load(f) + except: + return {} + return {} - if site_packages_path: - torch_version_file = open(os.path.join(site_packages_path, 'torch', 'version.py')).read().splitlines() - torver = [line for line in torch_version_file if line.startswith('__version__')][0].split('__version__ = ')[1].strip("'") + +def save_state(state): + """Save installer state to JSON file""" + with open(state_file, 'w') as f: + json.dump(state, f) + + +def get_gpu_choice(): + """Get GPU choice from state file or ask user""" + state = load_state() + gpu_choice = state.get('gpu_choice') + + if not gpu_choice: + if "GPU_CHOICE" in os.environ: + choice = os.environ["GPU_CHOICE"].upper() + print_big_message(f"Selected GPU choice \"{choice}\" based on the GPU_CHOICE environment variable.") + else: + choice = get_user_choice( + "What is your GPU?", + { + 'A': 'NVIDIA - CUDA 12.4', + 'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4', + 'C': 'Apple M Series', + 'D': 'Intel Arc (beta)', + 'N': 'CPU mode' + }, + ) + + # Convert choice to GPU name + gpu_choice = {"A": "NVIDIA", "B": "AMD", "C": "APPLE", "D": "INTEL", "N": "NONE"}[choice] + + # Save choice to state + state['gpu_choice'] = gpu_choice + save_state(state) + + return gpu_choice + + +def get_pytorch_install_command(gpu_choice): + """Get PyTorch installation command based on GPU choice""" + base_cmd = f"python -m pip install torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION} " + + if gpu_choice == "NVIDIA": + return base_cmd + "--index-url https://download.pytorch.org/whl/cu124" + elif gpu_choice == "AMD": + return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.2.4" + elif gpu_choice in ["APPLE", "NONE"]: + return base_cmd + "--index-url https://download.pytorch.org/whl/cpu" + elif gpu_choice == "INTEL": + if is_linux(): + return "python -m pip install torch==2.1.0a0 torchvision==0.16.0a0 torchaudio==2.1.0a0 intel-extension-for-pytorch==2.1.10+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" + else: + return "python -m pip install torch==2.1.0a0 torchvision==0.16.0a0 torchaudio==2.1.0a0 intel-extension-for-pytorch==2.1.10 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" else: - from torch import __version__ as torver + return base_cmd - return torver + +def get_pytorch_update_command(gpu_choice): + """Get PyTorch update command based on GPU choice""" + base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}" + + if gpu_choice == "NVIDIA": + return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124" + elif gpu_choice == "AMD": + return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4" + elif gpu_choice in ["APPLE", "NONE"]: + return f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu" + elif gpu_choice == "INTEL": + intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() 
else "intel-extension-for-pytorch==2.1.10" + return f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" + else: + return base_cmd + + +def get_requirements_file(gpu_choice): + """Get requirements file path based on GPU choice""" + requirements_base = os.path.join("requirements", "full") + + if gpu_choice == "AMD": + file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt" + elif gpu_choice == "APPLE": + file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt" + elif gpu_choice in ["INTEL", "NONE"]: + file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt" + elif gpu_choice == "NVIDIA": + file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt" + else: + raise ValueError(f"Unknown GPU choice: {gpu_choice}") + + return os.path.join(requirements_base, file_name) def get_current_commit(): @@ -209,28 +287,8 @@ def get_user_choice(question, options_dict): def update_pytorch_and_python(): print_big_message("Checking for PyTorch updates.") - - # Update the Python version. Left here for future reference in case this becomes necessary. - # print_big_message("Checking for PyTorch and Python updates.") - # current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}" - # if current_python_version != PYTHON_VERSION: - # run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True) - - torver = torch_version() - base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}" - - if "+cu" in torver: - install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124" - elif "+rocm" in torver: - install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4" - elif "+cpu" in torver: - install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu" - elif "+cxx11" in torver: - intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10" - install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" - else: - install_cmd = base_cmd - + gpu_choice = get_gpu_choice() + install_cmd = get_pytorch_update_command(gpu_choice) run_cmd(install_cmd, assert_success=True, environment=True) @@ -256,43 +314,11 @@ def install_webui(): if os.path.isfile(state_file): os.remove(state_file) - # Ask the user for the GPU vendor - if "GPU_CHOICE" in os.environ: - choice = os.environ["GPU_CHOICE"].upper() - print_big_message(f"Selected GPU choice \"{choice}\" based on the GPU_CHOICE environment variable.") - - # Warn about changed meanings and handle old choices - if choice == "B": - print_big_message("Warning: GPU_CHOICE='B' now means 'AMD' in the new version.") - elif choice == "C": - print_big_message("Warning: GPU_CHOICE='C' now means 'Apple M Series' in the new version.") - elif choice == "D": - print_big_message("Warning: GPU_CHOICE='D' now means 'Intel Arc' in the new version.") - else: - choice = get_user_choice( - "What is your GPU?", - { - 'A': 'NVIDIA - CUDA 12.4', - 'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4', - 'C': 'Apple M Series', - 'D': 'Intel Arc (beta)', - 'N': 'CPU mode' - }, - ) - - # Convert choices to GPU names for compatibility - gpu_choice_to_name = { - "A": "NVIDIA", - "B": "AMD", - "C": "APPLE", - "D": "INTEL", - "N": "NONE" - } - - selected_gpu = gpu_choice_to_name[choice] + 
# Get GPU choice and save it to state + gpu_choice = get_gpu_choice() # Write a flag to CMD_FLAGS.txt for CPU mode - if selected_gpu == "NONE": + if gpu_choice == "NONE": cmd_flags_path = os.path.join(script_dir, "user_data", "CMD_FLAGS.txt") with open(cmd_flags_path, 'r+') as cmd_flags_file: if "--cpu" not in cmd_flags_file.read(): @@ -300,34 +326,20 @@ def install_webui(): cmd_flags_file.write("\n--cpu\n") # Handle CUDA version display - elif any((is_windows(), is_linux())) and selected_gpu == "NVIDIA": + elif any((is_windows(), is_linux())) and gpu_choice == "NVIDIA": print("CUDA: 12.4") # No PyTorch for AMD on Windows (?) - elif is_windows() and selected_gpu == "AMD": + elif is_windows() and gpu_choice == "AMD": print("PyTorch setup on Windows is not implemented yet. Exiting...") sys.exit(1) - # Find the Pytorch installation command - install_pytorch = f"python -m pip install torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION} " - - if selected_gpu == "NVIDIA": - install_pytorch += "--index-url https://download.pytorch.org/whl/cu124" - elif selected_gpu == "AMD": - install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4" - elif selected_gpu in ["APPLE", "NONE"]: - install_pytorch += "--index-url https://download.pytorch.org/whl/cpu" - elif selected_gpu == "INTEL": - if is_linux(): - install_pytorch = "python -m pip install torch==2.1.0a0 torchvision==0.16.0a0 torchaudio==2.1.0a0 intel-extension-for-pytorch==2.1.10+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" - else: - install_pytorch = "python -m pip install torch==2.1.0a0 torchvision==0.16.0a0 torchaudio==2.1.0a0 intel-extension-for-pytorch==2.1.10 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/" - # Install Git and then Pytorch print_big_message("Installing PyTorch.") + install_pytorch = get_pytorch_install_command(gpu_choice) run_cmd(f"conda install -y ninja git && {install_pytorch} && python -m pip install py-cpuinfo==9.0.0", assert_success=True, environment=True) - if selected_gpu == "INTEL": + if gpu_choice == "INTEL": # Install oneAPI dependencies via conda print_big_message("Installing Intel oneAPI runtime libraries.") run_cmd("conda install -y -c https://software.repos.intel.com/python/conda/ -c conda-forge dpcpp-cpp-rt=2024.0 mkl-dpcpp=2024.0", environment=True) @@ -349,31 +361,15 @@ def update_requirements(initial_installation=False, pull=True): assert_success=True ) - torver = torch_version() - requirements_base = os.path.join("requirements", "full") - - if "+rocm" in torver: - file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt" - elif "+cpu" in torver or "+cxx11" in torver: - file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt" - elif is_macos(): - file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt" - else: - file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt" - - requirements_file = os.path.join(requirements_base, file_name) - - # Load state from JSON file current_commit = get_current_commit() - wheels_changed = False - if os.path.exists(state_file): - with open(state_file, 'r') as f: - last_state = json.load(f) - - if 'wheels_changed' in last_state or last_state.get('last_installed_commit') != current_commit: + wheels_changed = not os.path.exists(state_file) + if not wheels_changed: + state = load_state() + if 'wheels_changed' in state or state.get('last_installed_commit') != current_commit: 
wheels_changed = True - else: - wheels_changed = True + + gpu_choice = get_gpu_choice() + requirements_file = get_requirements_file(gpu_choice) if pull: # Read .whl lines before pulling @@ -409,19 +405,17 @@ def update_requirements(initial_installation=False, pull=True): print_big_message(f"File '{file}' was updated during 'git pull'. Please run the script again.") # Save state before exiting - current_state = {} + state = load_state() if wheels_changed: - current_state['wheels_changed'] = True - - with open(state_file, 'w') as f: - json.dump(current_state, f) - + state['wheels_changed'] = True + save_state(state) sys.exit(1) # Save current state - current_state = {'last_installed_commit': current_commit} - with open(state_file, 'w') as f: - json.dump(current_state, f) + state = load_state() + state['last_installed_commit'] = current_commit + state.pop('wheels_changed', None) # Remove wheels_changed flag + save_state(state) if os.environ.get("INSTALL_EXTENSIONS", "").lower() in ("yes", "y", "true", "1", "t", "on"): install_extensions_requirements() @@ -432,11 +426,10 @@ def update_requirements(initial_installation=False, pull=True): # Update PyTorch if not initial_installation: update_pytorch_and_python() - torver = torch_version() clean_outdated_pytorch_cuda_dependencies() print_big_message(f"Installing webui requirements from file: {requirements_file}") - print(f"TORCH: {torver}\n") + print(f"GPU Choice: {gpu_choice}\n") # Prepare the requirements file textgen_requirements = open(requirements_file).read().splitlines() From b30a73016d626e985e248de15fa65e5a531c8bd2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 2 Jun 2025 07:49:22 -0700 Subject: [PATCH 51/59] Remove the "Is typing..." yield by default --- modules/chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index f1ea16f1..3c4c3636 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -656,7 +656,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess update_message_metadata(output['metadata'], "user", row_idx, timestamp=get_current_timestamp()) # *Is typing...* - if loading_message: + if loading_message and shared.processing_message: yield { 'visible': output['visible'][:-1] + [[output['visible'][-1][0], shared.processing_message]], 'internal': output['internal'], @@ -680,7 +680,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess }) output['metadata'][key]["current_version_index"] = len(output['metadata'][key]["versions"]) - 1 - if loading_message: + if loading_message and shared.processing_message: yield { 'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]], 'internal': output['internal'][:-1] + [[text, '']], From b38ec0ec385d44d49d3fe7adf2ad77ae62302214 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 2 Jun 2025 11:33:17 -0700 Subject: [PATCH 52/59] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- 
requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 04d97220..277f8249 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index f807199d..dbf35c34 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git 
a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 4fb70eb1..2e5eb6c9 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index a311ab9b..9a19ab29 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -33,7 +33,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 30e8409a..973d9bfb 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 70949949..4a48a51f 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 318bb93a..76bde864 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -33,5 +33,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and 
platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 7c3635cc..6cd0fa65 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index bde310e1..60ce941e 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 521edc0c..b1649bc9 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index ef7946ff..571eba52 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -19,6 +19,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index a3ad743e..88170cf3 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 
eec052d3..e96cef49 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index c9898a05..78f94aa5 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 0de9c7cb..3e41427d 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 2bfb4d51..022ebb61 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -19,5 +19,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 93b3752cdf9f43dd391462168e2e14dd2ab75643 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 4 Jun 2025 09:40:30 -0700 Subject: [PATCH 53/59] Revert "Remove the "Is typing..." yield by default" This reverts commit b30a73016d626e985e248de15fa65e5a531c8bd2. --- modules/chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 3c4c3636..f1ea16f1 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -656,7 +656,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess update_message_metadata(output['metadata'], "user", row_idx, timestamp=get_current_timestamp()) # *Is typing...* - if loading_message and shared.processing_message: + if loading_message: yield { 'visible': output['visible'][:-1] + [[output['visible'][-1][0], shared.processing_message]], 'internal': output['internal'], @@ -680,7 +680,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess }) output['metadata'][key]["current_version_index"] = len(output['metadata'][key]["versions"]) - 1 - if loading_message and shared.processing_message: + if loading_message: yield { 'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]], 'internal': output['internal'][:-1] + [[text, '']], From 9bd7359ffab5e434b7cdfdb43ee91cb3ad397c0d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 4 Jun 2025 10:47:14 -0700 Subject: [PATCH 54/59] Scroll the textarea into view when editing a message --- js/global_scope_js.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/js/global_scope_js.js b/js/global_scope_js.js index d5140c93..801f1574 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -95,6 +95,12 @@ function startEditing(messageElement, messageBody, isUserMessage) { editingInterface.textarea.focus(); editingInterface.textarea.setSelectionRange(rawText.length, rawText.length); + // Scroll the textarea into view + editingInterface.textarea.scrollIntoView({ + behavior: "smooth", + block: "center" + }); + // Setup event handlers setupEditingHandlers(editingInterface.textarea, messageElement, originalHTML, messageBody, isUserMessage); } From 66a75c899a4b0786cd8744886a189864923287b5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 4 Jun 2025 10:59:43 -0700 Subject: [PATCH 55/59] Improve the scrollbars in code blocks --- js/main.js | 1 + 1 file changed, 1 insertion(+) diff --git a/js/main.js b/js/main.js index 8090937f..2e72d82e 100644 --- a/js/main.js +++ b/js/main.js @@ -229,6 +229,7 @@ function doSyntaxHighlighting() { codeBlocks.forEach((codeBlock) => { hljs.highlightElement(codeBlock); codeBlock.setAttribute("data-highlighted", "true"); + codeBlock.classList.add("pretty_scrollbar"); }); renderMathInElement(messageBody, { From 3d676cd50f8661ca96a20a452611422acb47177c Mon Sep 17 00:00:00 2001 From: 
oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 4 Jun 2025 11:02:04 -0700 Subject: [PATCH 56/59] Optimize syntax highlighting --- js/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index 2e72d82e..9a620fa9 100644 --- a/js/main.js +++ b/js/main.js @@ -217,7 +217,7 @@ function isElementVisibleOnScreen(element) { } function doSyntaxHighlighting() { - const messageBodies = document.querySelectorAll(".message-body"); + const messageBodies = document.getElementById("chat").querySelectorAll(".message-body"); if (messageBodies.length > 0) { observer.disconnect(); From 3829507d0fd66eccc532b5d8d0e3d77c38143d0c Mon Sep 17 00:00:00 2001 From: Hanusz Leszek Date: Wed, 4 Jun 2025 20:13:36 +0200 Subject: [PATCH 57/59] Stop model during graceful shutdown (#7042) --- server.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/server.py b/server.py index c22ed1f1..99d2e171 100644 --- a/server.py +++ b/server.py @@ -60,6 +60,14 @@ from modules.utils import gradio def signal_handler(sig, frame): logger.info("Received Ctrl+C. Shutting down Text generation web UI gracefully.") + + # Explicitly stop LlamaServer to avoid __del__ cleanup issues during shutdown + if shared.model and shared.model.__class__.__name__ == 'LlamaServer': + try: + shared.model.stop() + except: + pass + sys.exit(0) From 977ec801b7682c3239fe3e6fdfcb8b90c1e802f3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 5 Jun 2025 06:33:45 -0700 Subject: [PATCH 58/59] Improve table colors in instruct mode --- css/html_instruct_style.css | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 6ad250aa..9831ee8f 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -17,6 +17,14 @@ color: #d1d5db !important; } +.chat .message-body :is(th, td) { + border-color: #40404096 !important; +} + +.dark .chat .message-body :is(th, td) { + border-color: #ffffff75 !important; +} + .chat .message-body :is(p, ul, ol) { margin: 1.25em 0 !important; } From d47c8eb956a72ebc7c1f582718758697aef62118 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 5 Jun 2025 06:56:24 -0700 Subject: [PATCH 59/59] Remove quotes from LLM-generated websearch query (closes #7045). Fix by @Quiet-Joker --- modules/chat.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index f1ea16f1..14f2a4f7 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -604,7 +604,12 @@ def generate_search_query(user_message, state): query = "" for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True): - query = reply.strip() + query = reply + + # Strip and remove surrounding quotes if present + query = query.strip() + if len(query) >= 2 and query.startswith('"') and query.endswith('"'): + query = query[1:-1] return query
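Note on the last patch above: the web-search query post-processing it adds can be exercised on its own. The following is a minimal standalone sketch of that trimming step, not part of the patch itself; the helper name and the sample replies are illustrative only.

def strip_search_query(reply: str) -> str:
    # Same post-processing as in generate_search_query():
    # trim whitespace, then drop one pair of surrounding double quotes if present.
    query = reply.strip()
    if len(query) >= 2 and query.startswith('"') and query.endswith('"'):
        query = query[1:-1]
    return query


if __name__ == "__main__":
    # Hypothetical LLM replies and the resulting search-engine queries
    for raw in ['"latest llama.cpp release notes"', '  weather in Lisbon today  ', '"']:
        print(repr(raw), "->", repr(strip_search_query(raw)))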
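Similarly, the UI-update gating introduced earlier in this series ("Update only the last message during streaming + add back dynamic UI update speed", the latency_threshold branch in modules/text_generation.py) can be illustrated in isolation. This is a simplified sketch under the assumption of a generic iterator of partial replies; the function and constant names below are illustrative, not part of the patch.

import time

LATENCY_THRESHOLD = 1 / 1000  # seconds; faster than ~1000 tok/s is treated as the UI lagging behind


def throttled_ui_stream(partial_replies):
    """Yield UI updates only when the generator is not outpacing the UI.

    partial_replies is any iterable of progressively longer reply strings,
    mirroring what generate_func yields inside _generate_reply().
    """
    last_update = -1
    reply = ""
    for reply in partial_replies:
        cur_time = time.monotonic()
        # Skip this UI update if the previous one happened less than
        # LATENCY_THRESHOLD seconds ago (tokens are arriving too fast to render).
        if (cur_time - last_update) > LATENCY_THRESHOLD:
            yield reply
            last_update = time.monotonic()
    # Always emit the final state so the UI ends up consistent.
    yield reply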