From c12a53c998ce39ec762b9f7895861f1d94c2d827 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 1 May 2025 19:46:56 -0700 Subject: [PATCH 001/164] Use turboderp's exllamav2 wheels --- requirements/full/requirements.txt | 6 +++--- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 6 +++--- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 6f265eba..c0ace41b 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_c https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index c8e75ee7..91582eb3 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -30,5 +30,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == 
"x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index e54d6d9c..7b86050e 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -30,5 +30,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index d714ea3d..cc747edb 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -32,4 +32,4 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 89f4f576..67b3260e 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,4 +33,4 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_c 
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index e216c9cd..3575d352 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_c https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From f8aaf3c23a793b60ce7452213304acb493be98af Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 1 May 2025 19:50:46 -0700 Subject: [PATCH 002/164] Use ROCm 
6.2.4 on AMD --- README.md | 2 +- one_click.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4b541b9e..3280186c 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ conda activate textgen |--------|---------|---------| | Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | | Linux/WSL | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu` | -| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.1` | +| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` | | MacOS + MPS | Any | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` | | Windows | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | | Windows | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` | diff --git a/one_click.py b/one_click.py index 065afd99..cb16b813 100644 --- a/one_click.py +++ b/one_click.py @@ -222,7 +222,7 @@ def update_pytorch_and_python(): if "+cu" in torver: install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124" elif "+rocm" in torver: - install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1" + install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4" elif "+cpu" in torver: install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu" elif "+cxx11" in torver: @@ -273,7 +273,7 @@ def install_webui(): "What is your GPU?", { 'A': 'NVIDIA - CUDA 12.4', - 'B': 'AMD - Linux/macOS only, requires ROCm 6.1', + 'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4', 'C': 'Apple M Series', 'D': 'Intel Arc (beta)', 'N': 'CPU mode' @@ -314,7 +314,7 @@ def install_webui(): if selected_gpu == "NVIDIA": install_pytorch += "--index-url https://download.pytorch.org/whl/cu124" elif selected_gpu == "AMD": - install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1" + install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4" elif selected_gpu in ["APPLE", "NONE"]: install_pytorch += "--index-url https://download.pytorch.org/whl/cpu" elif selected_gpu == "INTEL": From d5c407cf35453ba2d06eea942942ff11cdc7993b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 1 May 2025 20:05:36 -0700 Subject: [PATCH 003/164] Use Vulkan instead of ROCm for llama.cpp on AMD --- requirements/full/requirements_amd.txt | 3 ++- requirements/full/requirements_amd_noavx2.txt | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 91582eb3..24eeee6a 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,6 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 7b86050e..99716f3c 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,6 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" From 9e3867dc8358baf153d6f7c182496dad158696a4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 09:36:15 -0700 Subject: [PATCH 004/164] llama.cpp: Fix manual random seeds --- modules/text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 8d091868..b9bf9b16 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -480,7 +480,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N For models that do not use the transformers library for sampling """ - seed = set_manual_seed(state['seed']) + state['seed'] = set_manual_seed(state['seed']) t0 = time.time() reply = '' try: From 3f26b0408bd02f500acc8c090a7e50ee286051b5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 16:17:22 -0700 Subject: [PATCH 005/164] Fix after 9e3867dc8358baf153d6f7c182496dad158696a4 --- modules/text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index b9bf9b16..8fd65dc4 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -500,7 +500,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N t1 = time.time() original_tokens = len(encode(original_question)[0]) new_tokens = len(encode(original_question + reply)[0]) - original_tokens - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') + print(f'Output generated in 
{(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})') return From 905afced1c8339833280de254cd597b389a3dade Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 16:32:22 -0700 Subject: [PATCH 006/164] Add a --portable flag to hide things in portable mode --- modules/presets.py | 9 ++++++++- modules/shared.py | 1 + modules/ui_model_menu.py | 17 +++++++++++------ modules/ui_parameters.py | 2 +- server.py | 5 +++-- start_linux.sh | 2 +- start_macos.sh | 2 +- start_windows.bat | 2 +- 8 files changed, 27 insertions(+), 13 deletions(-) diff --git a/modules/presets.py b/modules/presets.py index a432bf52..50d0f985 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -11,7 +11,7 @@ from modules.logging_colors import logger def default_preset(): - return { + result = { 'temperature': 1, 'dynatemp_low': 1, 'dynatemp_high': 1, @@ -50,6 +50,13 @@ def default_preset(): 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', } + if shared.args.portable: + samplers = result['sampler_priority'].split('\n') + samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature", "repetition_penalty"]] + result['sampler_priority'] = '\n'.join(samplers) + + return result + def presets_params(): return [k for k in default_preset()] diff --git a/modules/shared.py b/modules/shared.py index fb10c014..39b0bdaa 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -190,6 +190,7 @@ group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certific group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None) group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy') group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.') +group.add_argument('--portable', action='store_true', help='Hide features not available in portable mode like training.') # API group = parser.add_argument_group('API') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index d13bcff7..4a49d209 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -35,14 +35,17 @@ def create_ui(): shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) with gr.Column(): - with gr.Row(): - shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) - shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) + if shared.args.portable: + pass + else: + with gr.Row(): + shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) + shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', 
interactive=not mu) with gr.Row(): with gr.Column(): - shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None) + shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None) with gr.Blocks(): with gr.Row(): with gr.Column(): @@ -150,7 +153,9 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) - shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) + if not shared.args.portable: + shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) + shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model')) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 3f609d71..071b30b6 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -21,7 +21,7 @@ def create_ui(default_preset): shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button') with gr.Column(): - shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') + shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown') with gr.Row(): with gr.Column(): diff --git a/server.py b/server.py index 169578a5..b0b9e633 100644 --- a/server.py +++ b/server.py @@ -90,7 +90,7 @@ def create_interface(): 'instruction_template_str': shared.settings['instruction_template_str'], 'prompt_menu-default': shared.settings['prompt-default'], 'prompt_menu-notebook': shared.settings['prompt-notebook'], - 'filter_by_loader': shared.args.loader or 'All' + 'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp' }) if Path("user_data/cache/pfp_character.png").exists(): @@ -127,7 +127,8 @@ def create_interface(): ui_parameters.create_ui(shared.settings['preset']) # Parameters tab ui_model_menu.create_ui() # Model tab - training.create_ui() # Training tab + if not shared.args.portable: + training.create_ui() # Training tab ui_session.create_ui() # Session tab # Generation events diff --git a/start_linux.sh b/start_linux.sh index 00082f07..c74f1272 100755 --- a/start_linux.sh +++ b/start_linux.sh @@ -4,7 +4,7 @@ cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case if [ -d "portable_env" ]; then - ./portable_env/bin/python3 server.py --api --auto-launch "$@" + ./portable_env/bin/python3 server.py --portable --api --auto-launch "$@" exit $? 
fi diff --git a/start_macos.sh b/start_macos.sh index 628f59cc..7a060ba6 100755 --- a/start_macos.sh +++ b/start_macos.sh @@ -4,7 +4,7 @@ cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case if [ -d "portable_env" ]; then - ./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@" + ./portable_env/bin/python3 server.py --portable --api --auto-launch --api-port 5005 "$@" exit $? fi diff --git a/start_windows.bat b/start_windows.bat index 451b85e0..1616ee27 100755 --- a/start_windows.bat +++ b/start_windows.bat @@ -5,7 +5,7 @@ cd /D "%~dp0" @rem Portable install case if exist "portable_env" ( - .\portable_env\python.exe server.py --api --auto-launch %* + .\portable_env\python.exe server.py --portable --api --auto-launch %* exit /b %errorlevel% ) From 4cea720da8cca27cbb5e8ac560019a55e6afb73a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 16:38:28 -0700 Subject: [PATCH 007/164] UI: Remove the "Autoload the model" feature --- modules/shared.py | 1 - modules/ui_model_menu.py | 9 ++------- user_data/settings-template.yaml | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 39b0bdaa..cfedb992 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -60,7 +60,6 @@ settings = { 'custom_stopping_strings': '', 'custom_token_bans': '', 'negative_prompt': '', - 'autoload_model': False, 'dark_theme': True, 'default_extensions': [], 'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}", diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 4a49d209..9361ef91 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -30,7 +30,7 @@ def create_ui(): with gr.Row(): shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu) ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) - shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu) + shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu) shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) @@ -108,9 +108,6 @@ def create_ui(): shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. 
If 0, uses the same as the main model.') with gr.Column(): - with gr.Row(): - shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu) - with gr.Tab("Download"): shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu) shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu) @@ -135,11 +132,10 @@ def create_event_handlers(): # In this event handler, the interface state is read and updated # with the model defaults (if any), and then the model is loaded - # unless "autoload_model" is unchecked shared.gradio['model_menu'].change( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then( - load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success( + partial(load_model_wrapper, autoload=False), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) shared.gradio['load_model'].click( @@ -158,7 +154,6 @@ def create_event_handlers(): shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) - shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model')) shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True) diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml index 20896da3..ce0f77e1 100644 --- a/user_data/settings-template.yaml +++ b/user_data/settings-template.yaml @@ -31,7 +31,6 @@ seed: -1 custom_stopping_strings: '' custom_token_bans: '' negative_prompt: '' -autoload_model: false dark_theme: true default_extensions: [] instruction_template_str: |- From 3526b7923c9f5a3b3ba55056e445a660a03d2bc6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 17:40:53 -0700 Subject: [PATCH 008/164] Remove extensions with requirements from portable builds --- .github/workflows/build-portable-release-cuda.yml | 2 ++ .github/workflows/build-portable-release-vulkan.yml | 2 ++ .github/workflows/build-portable-release.yml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml index fb9e61b0..571cbac0 100644 --- a/.github/workflows/build-portable-release-cuda.yml +++ 
b/.github/workflows/build-portable-release-cuda.yml @@ -102,6 +102,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables CUDA_VERSION="${{ matrix.cuda }}" diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml index 8de29791..4e88d4d9 100644 --- a/.github/workflows/build-portable-release-vulkan.yml +++ b/.github/workflows/build-portable-release-vulkan.yml @@ -101,6 +101,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables AVX_SUPPORT="${{ matrix.avx }}" diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml index bdf96cec..6910ce2c 100644 --- a/.github/workflows/build-portable-release.yml +++ b/.github/workflows/build-portable-release.yml @@ -101,6 +101,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables AVX_SUPPORT="${{ matrix.avx }}" From d08acb4af9c2a4f4d0f7fd97babb217c0890e1c8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 20:50:52 -0700 Subject: [PATCH 009/164] UI: Rename enable_thinking -> Enable thinking --- modules/ui_parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 071b30b6..733d0901 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -82,7 +82,7 @@ def create_ui(default_preset): shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle mode.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle mode.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.') From b21bd8bb1e79466be945abfe417e92e52b63ec6f Mon Sep 17 00:00:00 2001 
From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 22:41:49 -0700 Subject: [PATCH 010/164] UI: Invert user/assistant message colors in instruct mode The goal is to make assistant messages more readable. --- css/html_instruct_style.css | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 4613b380..b98544a1 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -61,11 +61,11 @@ } .dark .chat .user-message { - background: transparent; + background: var(--light-gray); } .dark .chat .assistant-message { - background: var(--light-gray); + background: transparent; } .chat .user-message .text, From b71ef50e9d01c15a09c67b95e2032fed535c63ba Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 23:45:58 -0700 Subject: [PATCH 011/164] UI: Add a min-height to prevent constant scrolling during chat streaming --- css/chat_style-Dark.css | 2 ++ css/chat_style-TheEncrypted777.css | 2 ++ css/chat_style-cai-chat.css | 1 + css/main.css | 5 +++++ modules/html_generator.py | 9 ++++++--- 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css index 368a2a16..3b2bd385 100644 --- a/css/chat_style-Dark.css +++ b/css/chat_style-Dark.css @@ -1,5 +1,6 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 28px; font-size: 18px; @@ -102,6 +103,7 @@ @media screen and (width <= 688px) { .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 25px; font-size: 15px; diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css index 6404f41d..25d26db8 100644 --- a/css/chat_style-TheEncrypted777.css +++ b/css/chat_style-TheEncrypted777.css @@ -2,6 +2,7 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 28px; font-size: 18px; @@ -100,6 +101,7 @@ @media screen and (width <= 688px) { .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 25px; font-size: 15px; diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index 93276bd3..223f6150 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -1,5 +1,6 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 2em; font-size: 15px; diff --git a/css/main.css b/css/main.css index d6e5ac83..cf0dfde7 100644 --- a/css/main.css +++ b/css/main.css @@ -403,6 +403,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat-parent { height: calc(100dvh - 98px - var(--input-delta)); overflow: auto !important; + /* scroll-behavior: smooth; */ border-radius: 0 !important; margin-bottom: var(--input-delta) !important; } @@ -1382,3 +1383,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { 50% { opacity: 1; } 100% { opacity: 0.6; } } + +.streaming { + min-height: 70vh; +} diff --git a/modules/html_generator.py b/modules/html_generator.py index 67d15b6e..a6f5f930 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -365,8 +365,9 @@ def generate_instruct_html(history): f'' ) + streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
' f'
{converted_visible[1]}
' @@ -414,8 +415,9 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'
' ) + streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
{img_bot}
' f'
' @@ -452,8 +454,9 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'
' ) + streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
' f'
{converted_visible[1]}
' From ea60f14674a89d3a71e5504edacb8f64f148b57c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 3 May 2025 06:06:50 -0700 Subject: [PATCH 012/164] UI: Show the list of files if the user tries to download a GGUF repository --- modules/ui_model_menu.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 9361ef91..2c593df6 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -192,6 +192,26 @@ def load_lora_wrapper(selected_loras): def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False): try: + # Handle direct GGUF URLs + if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")): + try: + path = repo_id.split("huggingface.co/")[1] + + # Extract the repository ID (first two parts of the path) + parts = path.split("/") + if len(parts) >= 2: + extracted_repo_id = f"{parts[0]}/{parts[1]}" + + # Extract the filename (last part of the path) + filename = repo_id.split("/")[-1] + if "?download=true" in filename: + filename = filename.replace("?download=true", "") + + repo_id = extracted_repo_id + specific_file = filename + except: + pass + if repo_id == "": yield ("Please enter a model path") return @@ -205,6 +225,18 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur yield ("Getting the download links from Hugging Face") links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file) + + # Check for multiple GGUF files + gguf_files = [link for link in links if link.lower().endswith('.gguf')] + if len(gguf_files) > 1 and not specific_file: + output = "Multiple GGUF files found. 
Please copy one of the following filenames to the 'File name' field:\n\n```\n" + for link in gguf_files: + output += f"{Path(link).name}\n" + + output += "```" + yield output + return + if return_links: output = "```\n" for link in links: From 4c2e3b168bc1751dbb3f1b222fdd749ad7a5d36e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 3 May 2025 06:51:20 -0700 Subject: [PATCH 013/164] llama.cpp: Add a retry mechanism when getting the logits (sometimes it fails) --- modules/llama_cpp_server.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index d9187db8..2ebeb560 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -210,14 +210,15 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - response = self.session.post(url, json=payload) - result = response.json() + for retry in range(5): + response = self.session.post(url, json=payload) + result = response.json() - if "completion_probabilities" in result: - if use_samplers: - return result["completion_probabilities"][0]["top_probs"] - else: - return result["completion_probabilities"][0]["top_logprobs"] + if "completion_probabilities" in result: + if use_samplers: + return result["completion_probabilities"][0]["top_probs"] + else: + return result["completion_probabilities"][0]["top_logprobs"] else: raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}") From 5f5569e9ac21ffdcb335b0557909ad102104fc8f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 06:20:24 -0700 Subject: [PATCH 014/164] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3280186c..8a7b2467 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again. -You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`. +You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, or `update_wizard_macos.sh`.
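As an aside on the download handling added in PATCH 012 above: the direct-URL branch boils down to splitting a pasted Hugging Face GGUF link into a repository ID and a file name. A minimal standalone sketch of that logic follows; the URL in the usage line is a made-up example, not a real model.

    def split_gguf_url(repo_id, specific_file=""):
        # Mirrors the PATCH 012 logic: accept a direct huggingface.co link to a
        # .gguf file and derive the repo ID plus the specific file to download.
        if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and \
                (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")):
            path = repo_id.split("huggingface.co/")[1]
            parts = path.split("/")
            if len(parts) >= 2:
                extracted_repo_id = f"{parts[0]}/{parts[1]}"
                filename = repo_id.split("/")[-1].replace("?download=true", "")
                repo_id = extracted_repo_id
                specific_file = filename
        return repo_id, specific_file

    # Example (hypothetical URL):
    print(split_gguf_url(
        "https://huggingface.co/example-user/example-model-GGUF/resolve/main/example-q4_k_m.gguf?download=true"
    ))
    # -> ('example-user/example-model-GGUF', 'example-q4_k_m.gguf')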
@@ -55,7 +55,7 @@ Setup details and information about installing manually The script uses Miniconda to set up a Conda environment in the `installer_files` folder. -If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, `cmd_macos.sh`, or `cmd_wsl.bat`. +If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`. * There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root. * To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts. From b7a5c7db8de89159144fadb59920045efc3fe544 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 07:14:42 -0700 Subject: [PATCH 015/164] llama.cpp: Handle short arguments in --extra-flags --- modules/llama_cpp_server.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 2ebeb560..7244001a 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -317,9 +317,15 @@ class LlamaServer: for flag_item in extra_flags.split(','): if '=' in flag_item: flag, value = flag_item.split('=', 1) - cmd += [f"--{flag}", value] + if len(flag) <= 3: + cmd += [f"-{flag}", value] + else: + cmd += [f"--{flag}", value] else: - cmd.append(f"--{flag_item}") + if len(flag_item) <= 3: + cmd.append(f"-{flag_item}") + else: + cmd.append(f"--{flag_item}") env = os.environ.copy() if os.name == 'posix': From 7853fb1c8d701bb8b720b3907bdc50017911d6a6 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Sun, 4 May 2025 18:58:37 -0300 Subject: [PATCH 016/164] Optimize the Chat tab (#6948) --- css/main.css | 34 +++++++++++----------------------- js/main.js | 8 +------- modules/ui_chat.py | 2 +- 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/css/main.css b/css/main.css index cf0dfde7..64e96ccc 100644 --- a/css/main.css +++ b/css/main.css @@ -389,7 +389,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat { margin-left: auto; margin-right: auto; - min-height: var(--chat-height); + flex: 1; overflow-y: auto; display: flex; flex-direction: column; @@ -401,11 +401,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .chat-parent { - height: calc(100dvh - 98px - var(--input-delta)); + flex: 1; overflow: auto !important; - /* scroll-behavior: smooth; */ border-radius: 0 !important; - margin-bottom: var(--input-delta) !important; } .chat-parent .prose { @@ -422,8 +420,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .chat-parent.bigchat { - height: calc(100dvh - 98px - var(--input-delta)) !important; - margin-bottom: var(--input-delta) !important; + flex: 1; } .chat > .messages { @@ -604,8 +601,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .chat-input-positioned { - position: absolute; - bottom: 0; max-width: 54rem; left: 50%; transform: translateX(-50%); @@ -790,7 +785,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input-container { - min-width: 0 !important; + display: flex; + flex-direction: column; } #chat-input-container > .form { @@ -799,9 +795,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } 
#chat-input-row { - padding-bottom: 1.5em; - padding-left: 1rem; - padding-right: 1rem; + padding: 1rem; } #chat-input-row.bigchat { @@ -809,22 +803,16 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-col { - padding-bottom: 100px; + height: 100dvh; + display: flex; + flex-direction: column; + padding-bottom: 0; } @media screen and (width <= 924px) { #chat-col { - padding-bottom: 100px; + height: calc(100dvh - 132px); margin-top: 32px; - position: relative; /* Ensure positioning for the pseudo-element */ - } - - .chat-parent { - height: calc(100dvh - 98px - var(--input-delta) - 32px); - } - - .chat-parent.bigchat { - height: calc(100dvh - 98px - var(--input-delta) - 32px) !important; } } diff --git a/js/main.js b/js/main.js index 33b7d6bd..408815db 100644 --- a/js/main.js +++ b/js/main.js @@ -442,12 +442,6 @@ function updateCssProperties() { // Check if the chat container is visible if (chatContainer.clientHeight > 0) { - const chatContainerParentHeight = chatContainer.parentNode.clientHeight; - const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`; - - document.documentElement.style.setProperty("--chat-height", newChatHeight); - document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`); - // Adjust scrollTop based on input height change if (chatInputHeight !== currentChatInputHeight) { const deltaHeight = chatInputHeight - currentChatInputHeight; @@ -720,7 +714,7 @@ function isMobile() { // Function to initialize sidebars function initializeSidebars() { const isOnMobile = isMobile(); - + if (isOnMobile) { // Mobile state: Hide sidebars and set closed states [pastChatsRow, chatControlsRow, headerBar].forEach(el => { diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 0d588549..0856cfab 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -46,8 +46,8 @@ def create_ui(): with gr.Row(): with gr.Column(elem_id='chat-col'): - shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer + shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) with gr.Row(elem_id="chat-input-row"): with gr.Column(scale=1, elem_id='gr-hover-container'): gr.HTML(value='
', elem_id='gr-hover') From d1866219261c5fa1e9d8c0a9c6c380b965ca7cc7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 15:19:46 -0700 Subject: [PATCH 017/164] UI: Fixes after previous commit --- css/main.css | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 64e96ccc..f76a2787 100644 --- a/css/main.css +++ b/css/main.css @@ -787,6 +787,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #chat-input-container { display: flex; flex-direction: column; + min-width: 0 !important; } #chat-input-container > .form { @@ -807,12 +808,13 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: flex; flex-direction: column; padding-bottom: 0; + gap: 0; } @media screen and (width <= 924px) { #chat-col { - height: calc(100dvh - 132px); margin-top: 32px; + height: calc(100dvh - 32px); } } From 84ab1f95bedd2433cec165502375901fdaa56a98 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 15:21:52 -0700 Subject: [PATCH 018/164] UI: Increase the chat area a bit --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index f76a2787..d5d5e771 100644 --- a/css/main.css +++ b/css/main.css @@ -797,6 +797,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #chat-input-row { padding: 1rem; + padding-top: 0; } #chat-input-row.bigchat { From d9da16edba88b09dcbfe97a7be302c65ed244ebb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 16:53:52 -0700 Subject: [PATCH 019/164] UI: Remove the chat input textarea border --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index d5d5e771..b3e699fa 100644 --- a/css/main.css +++ b/css/main.css @@ -581,6 +581,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #chat-input textarea { padding: 0.65rem 2.5rem; + border: 0; } #chat-input textarea::placeholder { From 690d693913f68d25d08fd74db902495766c12e5e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 18:04:52 -0700 Subject: [PATCH 020/164] UI: Add padding to only show the last message/reply after sending a message To avoid scrolling --- css/chat_style-Dark.css | 3 ++- css/chat_style-TheEncrypted777.css | 3 ++- css/chat_style-cai-chat-square.css | 3 ++- css/chat_style-cai-chat.css | 3 ++- css/chat_style-messenger.css | 3 ++- css/chat_style-wpp.css | 3 ++- css/html_instruct_style.css | 4 ---- css/main.css | 4 ---- js/main.js | 10 ++++++++++ modules/html_generator.py | 9 +++------ 10 files changed, 25 insertions(+), 20 deletions(-) diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css index 3b2bd385..1ad46bc0 100644 --- a/css/chat_style-Dark.css +++ b/css/chat_style-Dark.css @@ -2,7 +2,8 @@ display: grid; align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 28px; + padding-bottom: 14px; + padding-top: 14px; font-size: 18px; font-family: Roboto, Arial, sans-serif; /* Modern font */ line-height: 1.5; diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css index 25d26db8..9e1230b7 100644 --- a/css/chat_style-TheEncrypted777.css +++ b/css/chat_style-TheEncrypted777.css @@ -4,7 +4,8 @@ display: grid; align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 28px; + padding-bottom: 14px; + padding-top: 14px; font-size: 18px; font-family: 'Noto Sans', Arial, sans-serif; 
line-height: 1.428571429; diff --git a/css/chat_style-cai-chat-square.css b/css/chat_style-cai-chat-square.css index 854fff60..015f6927 100644 --- a/css/chat_style-cai-chat-square.css +++ b/css/chat_style-cai-chat-square.css @@ -16,6 +16,7 @@ } .message { - padding-bottom: 2em; + padding-bottom: 1em; + padding-top: 1em; grid-template-columns: 70px minmax(0, 1fr); } diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index 223f6150..0e91101f 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -2,7 +2,8 @@ display: grid; align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 2em; + padding-bottom: 1em; + padding-top: 1em; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 22.5px !important; diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css index f0fd1578..6518d6ca 100644 --- a/css/chat_style-messenger.css +++ b/css/chat_style-messenger.css @@ -1,5 +1,6 @@ .message { - padding-bottom: 25px; + padding-bottom: 12.5px; + padding-top: 12.5px; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 1.428571429; diff --git a/css/chat_style-wpp.css b/css/chat_style-wpp.css index 30ca61f3..1442dd0a 100644 --- a/css/chat_style-wpp.css +++ b/css/chat_style-wpp.css @@ -1,5 +1,6 @@ .message { - padding-bottom: 25px; + padding-bottom: 12.5px; + padding-top: 12.5px; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 1.428571429; diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index b98544a1..f4339311 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -8,10 +8,6 @@ padding-top: 0 !important; } -.chat > .messages > :last-child { - margin-bottom: 1.7rem !important; -} - .chat .message-body p, .chat .message-body li { font-size: 1rem !important; line-height: 28px !important; diff --git a/css/main.css b/css/main.css index b3e699fa..9915735d 100644 --- a/css/main.css +++ b/css/main.css @@ -1375,7 +1375,3 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { 50% { opacity: 1; } 100% { opacity: 0.6; } } - -.streaming { - min-height: 70vh; -} diff --git a/js/main.js b/js/main.js index 408815db..e6611788 100644 --- a/js/main.js +++ b/js/main.js @@ -150,6 +150,16 @@ const observer = new MutationObserver(function(mutations) { if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) { targetElement.scrollTop = targetElement.scrollHeight; } + + const chatElement = document.getElementById("chat"); + if (chatElement) { + const messagesContainer = chatElement.querySelector(".messages"); + const lastChild = messagesContainer?.lastElementChild; + const prevSibling = lastChild?.previousElementSibling; + if (lastChild && prevSibling) { + lastChild.style.minHeight = `calc(100vh - ${prevSibling.offsetHeight}px - 102px)`; + } + } }); // Configure the observer to watch for changes in the subtree and attributes diff --git a/modules/html_generator.py b/modules/html_generator.py index a6f5f930..67d15b6e 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -365,9 +365,8 @@ def generate_instruct_html(history): f'
' ) - streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
' f'
{converted_visible[1]}
' @@ -415,9 +414,8 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'
' ) - streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
{img_bot}
' f'
' @@ -454,9 +452,8 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'
' ) - streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
' f'
{converted_visible[1]}
' From 2da197bba4b1547d51086e312d877d942e810be2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 18:29:05 -0700 Subject: [PATCH 021/164] Refinement after previous commit --- js/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index e6611788..205cf88e 100644 --- a/js/main.js +++ b/js/main.js @@ -157,7 +157,7 @@ const observer = new MutationObserver(function(mutations) { const lastChild = messagesContainer?.lastElementChild; const prevSibling = lastChild?.previousElementSibling; if (lastChild && prevSibling) { - lastChild.style.minHeight = `calc(100vh - ${prevSibling.offsetHeight}px - 102px)`; + lastChild.style.minHeight = `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px))`; } } }); From d0211afb3c513bde0d8662bd686ddb0dc87354cd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 18:52:01 -0700 Subject: [PATCH 022/164] Save the chat history right after sending a message --- modules/chat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/chat.py b/modules/chat.py index 98913d5c..feac6bdd 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -483,6 +483,8 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): history = state['history'] for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)): yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history + if i == 0: + save_history(history, state['unique_id'], state['character_menu'], state['mode']) save_history(history, state['unique_id'], state['character_menu'], state['mode']) From df7bb0db1fe6478d037debf73272b10cef1f75c7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 20:03:55 -0700 Subject: [PATCH 023/164] Rename --n-gpu-layers to --gpu-layers --- modules/shared.py | 2 +- modules/ui_model_menu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index cfedb992..b952c4a1 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -120,7 +120,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') -group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') +group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=0, metavar='N', help='Number of layers to offload to the GPU.') group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.') group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. 
This saves VRAM but reduces the performance.') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 2c593df6..943645cf 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -49,7 +49,7 @@ def create_ui(): with gr.Blocks(): with gr.Row(): with gr.Column(): - shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) From f3da45f65d76f8c48fd95678ecc841afb0ddd04e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 20:37:15 -0700 Subject: [PATCH 024/164] ExLlamaV3_HF: Change max_chunk_size to 256 --- modules/exllamav3_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index 12b22f64..417df473 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -119,7 +119,7 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): reset = True # Maximum number of tokens to process in a single forward pass - max_chunk_size = 2048 + max_chunk_size = 256 # Make the forward call if labels is None: From b817bb33fd7b26a24c81798dabb36af4620d4a53 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 04:54:25 -0700 Subject: [PATCH 025/164] Minor fix after df7bb0db1fe6478d037debf73272b10cef1f75c7 --- modules/llama_cpp_server.py | 2 +- modules/ui_model_menu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 7244001a..0ddb3fff 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -256,7 +256,7 @@ class LlamaServer: self.server_path, "--model", self.model_path, "--ctx-size", str(shared.args.ctx_size), - "--n-gpu-layers", str(shared.args.n_gpu_layers), + "--gpu-layers", str(shared.args.gpu_layers), "--batch-size", str(shared.args.batch_size), "--port", str(self.port), ] diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 943645cf..e05d2256 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -49,7 +49,7 @@ def create_ui(): with gr.Blocks(): with gr.Row(): with gr.Column(): - shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. 
⚠️ Lower this value if you can\'t load the model.') shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) From 475e012ee8e0cbeb53fade01359cd649b9b5d470 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 06:16:11 -0700 Subject: [PATCH 026/164] UI: Improve the light theme colors --- css/main.css | 3 ++- modules/ui.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/css/main.css b/css/main.css index 9915735d..b1d6e345 100644 --- a/css/main.css +++ b/css/main.css @@ -2,7 +2,7 @@ --darker-gray: #202123; --dark-gray: #343541; --light-gray: #444654; - --light-theme-gray: #f5f5f5; + --light-theme-gray: #f3f4f6; --border-color-dark: #525252; --header-width: 112px; --selected-item-color-dark: #32333e; @@ -580,6 +580,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input textarea { + background: var(--light-theme-gray); padding: 0.65rem 2.5rem; border: 0; } diff --git a/modules/ui.py b/modules/ui.py index fb016f87..d08c1435 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -61,7 +61,7 @@ if not shared.args.old_colors: background_fill_primary_dark='var(--darker-gray)', body_background_fill="white", block_background_fill="transparent", - body_text_color="#333", + body_text_color='rgb(64, 64, 64)', button_secondary_background_fill="#f4f4f4", button_secondary_border_color="var(--border-color-primary)", From 6001d279c64d92c1f2a312142e41119807694729 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 07:42:13 -0700 Subject: [PATCH 027/164] Light theme improvement --- css/main.css | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index b1d6e345..38585a1c 100644 --- a/css/main.css +++ b/css/main.css @@ -979,6 +979,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { cursor: pointer; } +#past-chats .selected, +#past-chats label:hover { + background-color: rgb(224, 224, 224) !important; +} + #past-chats-buttons, #delete-chat-row, #rename-row { @@ -987,7 +992,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { gap: 9px; } - #past-chats-row, #chat-controls { width: 260px; From 967b70327ea10a9c5cc7c932583993687b9d4ba7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 07:56:47 -0700 Subject: [PATCH 028/164] Light theme improvement --- css/html_instruct_style.css | 2 +- css/main.css | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index f4339311..fb984338 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -42,7 +42,7 @@ } .chat .user-message { - background: #f5f5f5; + background: #f3f4f6; padding: 1.5rem 1rem; padding-bottom: 2rem; border-radius: 0; diff --git a/css/main.css b/css/main.css index 38585a1c..d6a0d220 100644 --- a/css/main.css +++ b/css/main.css @@ -2,7 +2,7 @@ --darker-gray: #202123; --dark-gray: #343541; --light-gray: #444654; - --light-theme-gray: #f3f4f6; + --light-theme-gray: #f9fbff; --border-color-dark: #525252; --header-width: 112px; --selected-item-color-dark: #32333e; @@ -580,7 +580,7 @@ div.svelte-362y77>*, 
div.svelte-362y77>.form>* { } #chat-input textarea { - background: var(--light-theme-gray); + background: #f3f4f6; padding: 0.65rem 2.5rem; border: 0; } @@ -981,7 +981,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #past-chats .selected, #past-chats label:hover { - background-color: rgb(224, 224, 224) !important; + background-color: #dbeafe !important; } #past-chats-buttons, @@ -1123,8 +1123,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border: 0 !important; } -.dark #past-chats .selected, -.dark #past-chats label:hover { +.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats .selected, +.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats label:hover { background-color: var(--selected-item-color-dark) !important; } @@ -1161,7 +1161,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .header_bar button.selected { - background: #E0E0E0; + background: #dbeafe; } #chat-controls, From bf5290bc0ff15f6894a4eb5785e8df60831ecb25 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 08:04:12 -0700 Subject: [PATCH 029/164] Fix the hover menu in light theme --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index d6a0d220..59165a62 100644 --- a/css/main.css +++ b/css/main.css @@ -742,7 +742,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .hover-menu button { width: 100%; - background: transparent !important; + background: white !important; border-radius: 0 !important; justify-content: space-between; margin: 0 !important; From 53d8e4650202f5891364197011098b3af34fe6ac Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 12:28:17 -0700 Subject: [PATCH 030/164] Ensure environment isolation in portable installs --- start_linux.sh | 9 +++++---- start_macos.sh | 9 +++++---- start_windows.bat | 9 +++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/start_linux.sh b/start_linux.sh index c74f1272..e2b00558 100755 --- a/start_linux.sh +++ b/start_linux.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# environment isolation +export PYTHONNOUSERSITE=1 +unset PYTHONPATH +unset PYTHONHOME + cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case @@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then exit fi -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME export CUDA_PATH="$INSTALL_ENV_DIR" export CUDA_HOME="$CUDA_PATH" diff --git a/start_macos.sh b/start_macos.sh index 7a060ba6..bff11bc1 100755 --- a/start_macos.sh +++ b/start_macos.sh @@ -1,5 +1,10 @@ #!/bin/bash +# environment isolation +export PYTHONNOUSERSITE=1 +unset PYTHONPATH +unset PYTHONHOME + cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case @@ -61,10 +66,6 @@ if [ ! 
-e "$INSTALL_ENV_DIR/bin/python" ]; then exit fi -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME export CUDA_PATH="$INSTALL_ENV_DIR" export CUDA_HOME="$CUDA_PATH" diff --git a/start_windows.bat b/start_windows.bat index 1616ee27..f5e66ec2 100755 --- a/start_windows.bat +++ b/start_windows.bat @@ -1,6 +1,11 @@ @echo off setlocal enabledelayedexpansion +@rem environment isolation +set PYTHONNOUSERSITE=1 +set PYTHONPATH= +set PYTHONHOME= + cd /D "%~dp0" @rem Portable install case @@ -87,10 +92,6 @@ if not exist "%INSTALL_ENV_DIR%" ( @rem check if conda environment was actually created if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end ) -@rem environment isolation -set PYTHONNOUSERSITE=1 -set PYTHONPATH= -set PYTHONHOME= set "CUDA_PATH=%INSTALL_ENV_DIR%" set "CUDA_HOME=%CUDA_PATH%" From 8137eb8ef46ac6950cb96094e3cc30b0a72dee76 Mon Sep 17 00:00:00 2001 From: mamei16 Date: Mon, 5 May 2025 23:05:23 +0200 Subject: [PATCH 031/164] Dynamic Chat Message UI Update Speed (#6952) --- modules/shared.py | 1 - modules/text_generation.py | 18 ++++++++---------- modules/ui.py | 1 - modules/ui_parameters.py | 2 -- user_data/settings-template.yaml | 1 - 5 files changed, 8 insertions(+), 15 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index b952c4a1..b4dfbfd1 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -47,7 +47,6 @@ settings = { 'max_new_tokens_max': 4096, 'prompt_lookup_num_tokens': 0, 'max_tokens_second': 0, - 'max_updates_second': 12, 'auto_max_new_tokens': True, 'ban_eos_token': False, 'add_bos_token': True, diff --git a/modules/text_generation.py b/modules/text_generation.py index 8fd65dc4..7e48a2f6 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -64,41 +64,39 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap all_stop_strings += st shared.stop_everything = False - last_update = -1 reply = '' is_stream = state['stream'] if len(all_stop_strings) > 0 and not state['stream']: state = copy.deepcopy(state) state['stream'] = True - min_update_interval = 0 - if state.get('max_updates_second', 0) > 0: - min_update_interval = 1 / state['max_updates_second'] - # Generate + last_update = -1 + latency_threshold = 1 / 1000 for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat): + cur_time = time.monotonic() reply, stop_found = apply_stopping_strings(reply, all_stop_strings) if escape_html: reply = html.escape(reply) if is_stream: - cur_time = time.time() - # Limit number of tokens/second to make text readable in real time if state['max_tokens_second'] > 0: diff = 1 / state['max_tokens_second'] - (cur_time - last_update) if diff > 0: time.sleep(diff) - last_update = time.time() + last_update = time.monotonic() yield reply # Limit updates to avoid lag in the Gradio UI # API updates are not limited else: - if cur_time - last_update > min_update_interval: - last_update = cur_time + # If 'generate_func' takes less than 0.001 seconds to yield the next token + # (equivalent to more than 1000 tok/s), assume that the UI is lagging behind and skip yielding + if (cur_time - last_update) > latency_threshold: yield reply + last_update = time.monotonic() if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything): break diff --git a/modules/ui.py b/modules/ui.py index d08c1435..b3d4bccf 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -192,7 +192,6 @@ def list_interface_input_elements(): 
'max_new_tokens', 'prompt_lookup_num_tokens', 'max_tokens_second', - 'max_updates_second', 'do_sample', 'dynamic_temperature', 'temperature_last', diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 733d0901..84f9fbfc 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -71,8 +71,6 @@ def create_ui(default_preset): shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.') shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.') shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.') - shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.') - with gr.Column(): with gr.Row(): with gr.Column(): diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml index ce0f77e1..db481e84 100644 --- a/user_data/settings-template.yaml +++ b/user_data/settings-template.yaml @@ -18,7 +18,6 @@ max_new_tokens_min: 1 max_new_tokens_max: 4096 prompt_lookup_num_tokens: 0 max_tokens_second: 0 -max_updates_second: 12 auto_max_new_tokens: true ban_eos_token: false add_bos_token: true From 85bf2e15b98117ef5630e81bf4a002440fffe2c2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 14:14:48 -0700 Subject: [PATCH 032/164] API: Remove obsolete multimodal extension handling Multimodal support will be added back once it's implemented in llama-server. 
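For reference, the block removed below handled OpenAI-style multimodal requests, where a message's "content" is a list of typed items instead of a plain string. A hedged sketch of that request shape (the field values are illustrative placeholders, not taken from the repository):

    # One user turn in the OpenAI chat format mixing text and an image; the removed
    # handler unpacked such lists into separate {"image_url": ...} and {"content": ...}
    # history entries before prompt building.
    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this picture?"},
            {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}},
        ],
    }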
--- extensions/openai/completions.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 75e2cc11..46c76199 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -96,30 +96,6 @@ def convert_history(history): user_input_last = True system_message = "" - # Multimodal: convert OpenAI format to multimodal extension format - if any('content' in entry and isinstance(entry['content'], list) for entry in history): - new_history = [] - for entry in history: - if isinstance(entry['content'], list): - for item in entry['content']: - if not isinstance(item, dict): - continue - - image_url = None - content = None - if item['type'] == 'image_url' and isinstance(item['image_url'], dict): - image_url = item['image_url']['url'] - elif item['type'] == 'text' and isinstance(item['text'], str): - content = item['text'] - if image_url: - new_history.append({"image_url": image_url, "role": "user"}) - if content: - new_history.append({"content": content, "role": "user"}) - else: - new_history.append(entry) - - history = new_history - for entry in history: if "image_url" in entry: image_url = entry['image_url'] From f82667f0b4c0824420a6637efee3c680ddbe25f3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 14:17:00 -0700 Subject: [PATCH 033/164] Remove more multimodal extension references --- extensions/openai/completions.py | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 46c76199..a7d8b4e4 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -1,13 +1,8 @@ -import base64 import copy -import re import time from collections import deque -from io import BytesIO -import requests import tiktoken -from PIL import Image from extensions.openai.errors import InvalidRequestError from extensions.openai.utils import debug_msg @@ -97,28 +92,7 @@ def convert_history(history): system_message = "" for entry in history: - if "image_url" in entry: - image_url = entry['image_url'] - if "base64" in image_url: - image_url = re.sub('^data:image/.+;base64,', '', image_url) - img = Image.open(BytesIO(base64.b64decode(image_url))) - else: - try: - my_res = requests.get(image_url) - img = Image.open(BytesIO(my_res.content)) - except Exception: - raise 'Image cannot be loaded from the URL!' - - buffered = BytesIO() - if img.mode in ("RGBA", "P"): - img = img.convert("RGB") - - img.save(buffered, format="JPEG") - img_str = base64.b64encode(buffered.getvalue()).decode('utf-8') - content = f'' - else: - content = entry["content"] - + content = entry["content"] role = entry["role"] if role == "user": From 941e0663da48345150ae77d7c6b6eb54e21d671d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 14:18:05 -0700 Subject: [PATCH 034/164] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8a7b2467..6cc84c50 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ If you ever need to install something manually in the `installer_files` environm * There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root. * To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. 
At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts. * For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki). -* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. +* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. ### Manual installation using Conda From 987505ead345b0e113d636311f6a5faa4fcbe986 Mon Sep 17 00:00:00 2001 From: Evgenii Novikov Date: Tue, 6 May 2025 00:03:33 +0200 Subject: [PATCH 035/164] docker: Fix app uid typo in cpu docker compose (#6957) --- docker/cpu/docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/cpu/docker-compose.yml b/docker/cpu/docker-compose.yml index c9d415ae..9aba314a 100644 --- a/docker/cpu/docker-compose.yml +++ b/docker/cpu/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: From 99bd66445f90df58a3f3832b35cca94dc397d1be Mon Sep 17 00:00:00 2001 From: Alireza Ghasemi Date: Tue, 6 May 2025 00:04:06 +0200 Subject: [PATCH 036/164] SuperboogaV2: minor update to avoid json serialization errors #6945 --- extensions/superboogav2/chromadb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extensions/superboogav2/chromadb.py b/extensions/superboogav2/chromadb.py index 6e93dd92..f4f77821 100644 --- a/extensions/superboogav2/chromadb.py +++ b/extensions/superboogav2/chromadb.py @@ -292,6 +292,8 @@ class ChromaCollector(): for doc in documents: doc_tokens = encode(doc)[0] + if isinstance(doc_tokens, np.ndarray): + doc_tokens = doc_tokens.tolist() doc_token_count = len(doc_tokens) if current_token_count + doc_token_count > max_token_count: # If adding this document would exceed the max token count, From 76f947e3cf1c71e4105f708f02b2ca163a69987c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 15:58:29 -0700 Subject: [PATCH 037/164] UI: Minor style change --- css/main.css | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/css/main.css b/css/main.css index 59165a62..520ff972 100644 --- a/css/main.css +++ b/css/main.css @@ -1380,3 +1380,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { 50% { opacity: 1; } 100% { opacity: 0.6; } } + +strong { + font-weight: bold; +} From 530223bf0b196257e41ec948c2e92e1c3e507e9f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 16:00:49 -0700 Subject: [PATCH 038/164] UI: Fix the hover menu colors --- css/main.css | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/css/main.css b/css/main.css index 520ff972..b8ba8256 100644 --- a/css/main.css +++ b/css/main.css @@ -761,6 +761,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { background: var(--button-secondary-background-fill-hover) !important; } +.dark .hover-menu button:hover { + background: 
var(--selected-item-color-dark) !important; +} + .transparent-substring { opacity: 0.333; } @@ -1109,12 +1113,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { color: #9ca3af; } -.dark .hover-menu { - background-color: var(--darker-gray); -} - .dark .hover-menu button { border-color: var(--border-color-primary); + background-color: var(--darker-gray) !important; } .dark #chat-controls, From 4e8f628d3c206e8362cea5b5f7557abe33351bc0 Mon Sep 17 00:00:00 2001 From: Evgenii Novikov Date: Tue, 6 May 2025 01:05:15 +0200 Subject: [PATCH 039/164] docker: App uid typo in other docker composes (#6958) --- docker/amd/docker-compose.yml | 2 +- docker/intel/docker-compose.yml | 2 +- docker/nvidia/docker-compose.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/amd/docker-compose.yml b/docker/amd/docker-compose.yml index 4709ae94..8866e9ed 100644 --- a/docker/amd/docker-compose.yml +++ b/docker/amd/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: diff --git a/docker/intel/docker-compose.yml b/docker/intel/docker-compose.yml index 31e9dde0..78e06698 100644 --- a/docker/intel/docker-compose.yml +++ b/docker/intel/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: diff --git a/docker/nvidia/docker-compose.yml b/docker/nvidia/docker-compose.yml index 835dd838..0392078e 100644 --- a/docker/nvidia/docker-compose.yml +++ b/docker/nvidia/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: From cbef35054cb598b033e17b0442e8dad2da6873c4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 17:46:09 -0700 Subject: [PATCH 040/164] UI: CSS fix --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index b8ba8256..746f1f9e 100644 --- a/css/main.css +++ b/css/main.css @@ -426,6 +426,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat > .messages { display: flex; flex-direction: column; + min-height: calc(100vh - 102px); } .chat > .messages > :first-child { From d1c0154d664e51d1ee6ea82d9c0e799d96367d4a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 06:38:39 -0700 Subject: [PATCH 041/164] llama.cpp: Add top_n_sigma, fix typical_p in sampler priority --- modules/llama_cpp_server.py | 5 ++++- modules/presets.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 0ddb3fff..b9902cd7 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -66,6 +66,7 @@ class LlamaServer: "top_k": state["top_k"], "top_p": state["top_p"], "min_p": state["min_p"], + "top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1, "tfs_z": state["tfs"], "typical_p": state["typical_p"], "repeat_penalty": 
state["repetition_penalty"], @@ -102,8 +103,10 @@ class LlamaServer: penalty_found = False for s in samplers: - if s.strip() in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]: + if s.strip() in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc"]: filtered_samplers.append(s.strip()) + elif s.strip() == "typical_p": + filtered_samplers.append("typ_p") elif not penalty_found and s.strip() == "repetition_penalty": filtered_samplers.append("penalties") penalty_found = True diff --git a/modules/presets.py b/modules/presets.py index 50d0f985..5a9a5873 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -52,7 +52,7 @@ def default_preset(): if shared.args.portable: samplers = result['sampler_priority'].split('\n') - samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature", "repetition_penalty"]] + samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]] result['sampler_priority'] = '\n'.join(samplers) return result From 89590adc14c941814c2d54795cfc78fab959d9e7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 06:41:17 -0700 Subject: [PATCH 042/164] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- .../full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- .../full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 18 ------------------ .../portable/requirements_amd_noavx2.txt | 18 ------------------ .../portable/requirements_apple_intel.txt | 4 ++-- .../portable/requirements_apple_silicon.txt | 6 +++--- .../portable/requirements_cpu_only.txt | 4 ++-- .../portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- .../portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 34 insertions(+), 70 deletions(-) delete mode 100644 requirements/portable/requirements_amd.txt delete mode 100644 requirements/portable/requirements_amd_noavx2.txt diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index c0ace41b..a60ea7b4 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and 
python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 24eeee6a..431cd740 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 99716f3c..0c581f86 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index cc747edb..f7213efe 100644 --- 
a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 67b3260e..4aac3dea 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 47ad5759..ac277d61 100644 
--- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 334f11df..cc412d33 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 3575d352..78265f1a 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
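The "; platform_system == ..." suffixes on the wheel URLs above and below are standard environment markers: pip evaluates each marker against the installing interpreter and skips any requirement whose marker is false, which is how a single requirements file can carry Windows, Linux and Python-version specific wheels side by side. A small, hedged illustration using the packaging library (this script is not part of the patch series; the marker string is copied from the lines above):

    from packaging.markers import Marker

    marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')
    # True only on 64-bit Linux under Python 3.11, so pip installs the wheel there and skips it elsewhere.
    print(marker.evaluate())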
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index c720daa7..1240d335 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt deleted file mode 100644 index 7d9c00c0..00000000 --- a/requirements/portable/requirements_amd.txt +++ /dev/null @@ -1,18 +0,0 @@ -fastapi==0.112.4 -gradio==4.37.* -jinja2==3.1.6 -markdown -numpy==1.26.* -pydantic==2.8.2 -pyyaml -requests -rich -tqdm - -# API -flask_cloudflared==0.0.14 -sse-starlette==1.6.5 -tiktoken - -# AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt deleted file mode 100644 index d718c1b1..00000000 --- a/requirements/portable/requirements_amd_noavx2.txt +++ /dev/null @@ -1,18 +0,0 @@ -fastapi==0.112.4 -gradio==4.37.* -jinja2==3.1.6 -markdown -numpy==1.26.* -pydantic==2.8.2 -pyyaml -requests -rich -tqdm - -# API -flask_cloudflared==0.0.14 -sse-starlette==1.6.5 -tiktoken - -# AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 9e184b53..6b165b7c 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index ec059716..1b2b5cf2 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index d473b824..2793d743 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index d3fffb43..6d7316a6 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index cdfa6a01..e56eba08 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 1a7ce6ed..a7f8c703 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 4737321d..5b427fd2 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-win_amd64.whl; 
platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 605cc9ab14533dd20cc11363f020fb9947cfb723 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 06:42:15 -0700 Subject: [PATCH 043/164] Update exllamav3 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index a60ea7b4..3b50c674 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index f7213efe..ba23ea9c 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -31,5 +31,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 4aac3dea..c245ab74 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,5 +32,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 78265f1a..d8bbf6d1 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" From 1927afe89457dce8eb805b2275ba7c8a9680a967 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 08:18:49 -0700 Subject: [PATCH 044/164] Fix top_n_sigma not showing for llama.cpp --- modules/loaders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/loaders.py b/modules/loaders.py index 738198b1..217d569c 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -300,6 +300,7 @@ loaders_samplers = { 'xtc_threshold', 'xtc_probability', 'tfs', + 'top_n_sigma', 'dry_multiplier', 'dry_allowed_length', 'dry_base', From 05115e42ee1ab7a2848b883e469885ce9504f04a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 08:27:21 -0700 Subject: [PATCH 045/164] Set top_n_sigma before temperature by default --- modules/presets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/presets.py b/modules/presets.py index 5a9a5873..cf706605 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -46,7 +46,7 @@ def default_preset(): 'do_sample': True, 'dynamic_temperature': False, 'temperature_last': False, - 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram', + 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntop_n_sigma\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram', 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', } From c4f36db0d859e1819550e576a7fbd513c990c64d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 08:41:13 -0700 Subject: [PATCH 046/164] llama.cpp: remove tfs (it doesn't get used) --- modules/llama_cpp_server.py | 1 - modules/loaders.py | 1 - 2 files changed, 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index b9902cd7..d8d2f61b 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -67,7 +67,6 @@ class LlamaServer: "top_p": state["top_p"], "min_p": state["min_p"], "top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1, - "tfs_z": state["tfs"], "typical_p": state["typical_p"], "repeat_penalty": state["repetition_penalty"], "repeat_last_n": state["repetition_penalty_range"], diff --git a/modules/loaders.py b/modules/loaders.py index 217d569c..b29679bd 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -299,7 +299,6 @@ loaders_samplers = { 'typical_p', 'xtc_threshold', 'xtc_probability', - 'tfs', 'top_n_sigma', 'dry_multiplier', 'dry_allowed_length', From 5ef564a22e8df21a7480d5c8d6e32919f35f14c7 Mon Sep 17 00:00:00 2001 From: Downtown-Case Date: Tue, 6 May 2025 15:03:33 -0500 Subject: [PATCH 047/164] Fix model config loading in shared.py for Python 3.13 (#6961) --- modules/shared.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index b4dfbfd1..6fd4604c 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -310,11 +310,13 @@ if args.api or args.public_api: 
add_extension('openai', last=True) # Load model-specific settings -with Path(f'{args.model_dir}/config.yaml') as p: - if p.exists(): - model_config = yaml.safe_load(open(p, 'r').read()) - else: - model_config = {} +p = Path(f'{args.model_dir}/config.yaml') +if p.exists(): + model_config = yaml.safe_load(open(p, 'r').read()) +else: + model_config = {} +del p + # Load custom model-specific settings user_config = load_user_config() From e4fb2475d25e1dccfa39f5d943bcde61ef517245 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 14:02:01 -0700 Subject: [PATCH 048/164] UI: Multiple small style improvements (light/dark themes) --- css/html_instruct_style.css | 2 +- css/main.css | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index fb984338..6ad250aa 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -66,7 +66,7 @@ .chat .user-message .text, .chat .assistant-message .text { - max-width: 645px; + max-width: 700px; margin-left: auto; margin-right: auto; } diff --git a/css/main.css b/css/main.css index 746f1f9e..30089aca 100644 --- a/css/main.css +++ b/css/main.css @@ -545,7 +545,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border-radius: 5px; font-size: 82%; padding: 1px 3px; - background: white !important; + background: #f3f4f6 !important; color: #1f2328; } @@ -559,18 +559,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 15px; } -.message-body :not(pre) > code::before { - content: "`"; -} - -.message-body :not(pre) > code::after { - content: "`"; -} - .message-body :not(pre) > code { white-space: normal !important; font-weight: bold; - font-family: unset; + font-size: 0.95em; + font-family: Menlo,"Roboto Mono","Courier New",Courier,monospace,Inter,sans-serif; + padding: .15rem .3rem; + background-color: #ececec; +} + +.dark .message-body :not(pre) > code { + background-color: rgb(255 255 255 / 12.5%); } #chat-input { @@ -584,6 +583,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { background: #f3f4f6; padding: 0.65rem 2.5rem; border: 0; + box-shadow: 0; } #chat-input textarea::placeholder { @@ -759,7 +759,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .hover-menu button:hover { - background: var(--button-secondary-background-fill-hover) !important; + background: #dbeafe !important; } .dark .hover-menu button:hover { From b28fa86db6921adc8a42038f7062b72a27cb68b1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 17:51:55 -0700 Subject: [PATCH 049/164] Default --gpu-layers to 256 --- modules/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index 6fd4604c..f2698bd2 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -119,7 +119,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') -group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=0, metavar='N', help='Number of layers to offload to the GPU.') +group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, 
metavar='N', help='Number of layers to offload to the GPU.') group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.') group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') From d2bae7694c0798f9f51bc61a1f7b20d93059f106 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 07:26:23 -0700 Subject: [PATCH 050/164] UI: Change the ctx-size description --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index e05d2256..8dea457e 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -54,7 +54,7 @@ def create_ui(): shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.') + shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536, 131072.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') From 348d4860c278eda1dedff15c05082e2d3358c3f3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 07:58:59 -0700 Subject: [PATCH 051/164] UI: Create a "Main options" section in the Model tab --- modules/ui_model_menu.py | 70 ++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 8dea457e..28b7222d 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -47,52 +47,27 @@ def create_ui(): with gr.Column(): shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None) with gr.Blocks(): + gr.Markdown("## Main options") with gr.Row(): with gr.Column(): shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. 
⚠️ Lower this value if you can\'t load the model.') - shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) - shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) - shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) - shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) + shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536, 131072.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') - shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') - shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags) - shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) - shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') - shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') - shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). 
Equal to 1/rope_freq_scale.') - shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.') - shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.') - shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') + shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) with gr.Column(): + shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') + shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') - shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') - shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') - shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') - shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) - shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') - shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. 
This saves VRAM but reduces the performance.') - shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) - shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) - shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') - shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') - shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') - shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) - shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) - shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) - shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) - shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') @@ -102,11 +77,44 @@ def create_ui(): shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. 
Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu) ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) - shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.') shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.') + shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.') shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.') + gr.Markdown("## Other options") + with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'): + with gr.Row(): + with gr.Column(): + shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) + shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) + shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) + shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') + shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags) + shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) + shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') + shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') + shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). 
Equal to 1/rope_freq_scale.') + shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.') + shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.') + shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') + + with gr.Column(): + shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') + shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) + shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') + shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') + shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) + shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) + shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') + shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') + shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) + shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) + shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) + shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) + shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') + shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') + with gr.Column(): with gr.Tab("Download"): shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. 
To download a single file, enter its name in the second box.", interactive=not mu) From a2ab42d39099d89543a8e5c5753350e51905fa36 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 08:00:38 -0700 Subject: [PATCH 052/164] UI: Remove the exllamav2 info message --- modules/loaders.py | 1 - modules/ui_model_menu.py | 1 - 2 files changed, 2 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index b29679bd..4b76549b 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -84,7 +84,6 @@ loaders_and_params = OrderedDict({ 'no_flash_attn', 'no_xformers', 'no_sdpa', - 'exllamav2_info', 'model_draft', 'draft_max', 'ctx_size_draft', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 28b7222d..33e152a0 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -68,7 +68,6 @@ def create_ui(): shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) - shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). 
\n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') # Speculative decoding From 13a434f3518e381d04acd869ed3c0ba3d3823d34 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 08:06:07 -0700 Subject: [PATCH 053/164] Bump exllamav3 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 3b50c674..ac89f45b 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index ba23ea9c..6abdb1a4 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -31,5 +31,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7-py3-none-any.whl 
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index c245ab74..682c6a47 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,5 +32,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index d8bbf6d1..1e185079 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; 
platform_system == "Linux" and platform_machine != "x86_64" From ed6e16191da79523c5cabfd927130b307b3b54b9 Mon Sep 17 00:00:00 2001 From: Scott Z Date: Thu, 8 May 2025 11:21:52 -0400 Subject: [PATCH 054/164] Docker fix for NVIDIA (#6964) --- docker/nvidia/Dockerfile | 2 +- docker/nvidia/docker-compose.yml | 12 +----------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/docker/nvidia/Dockerfile b/docker/nvidia/Dockerfile index 900a4329..82594a26 100644 --- a/docker/nvidia/Dockerfile +++ b/docker/nvidia/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose -COPY CMD_FLAGS.txt /home/app/text-generation-webui/ +COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui # set umask to ensure group read / write at runtime diff --git a/docker/nvidia/docker-compose.yml b/docker/nvidia/docker-compose.yml index 0392078e..23d5cacc 100644 --- a/docker/nvidia/docker-compose.yml +++ b/docker/nvidia/docker-compose.yml @@ -31,17 +31,7 @@ services: stdin_open: true tty: true volumes: - - ./cache:/home/app/text-generation-webui/cache - - ./characters:/home/app/text-generation-webui/characters - - ./extensions:/home/app/text-generation-webui/extensions - - ./loras:/home/app/text-generation-webui/loras - - ./logs:/home/app/text-generation-webui/logs - - ./models:/home/app/text-generation-webui/models - - ./presets:/home/app/text-generation-webui/presets - - ./prompts:/home/app/text-generation-webui/prompts - - ./softprompts:/home/app/text-generation-webui/softprompts - - ./training:/home/app/text-generation-webui/training - - ./cloudflared:/etc/cloudflared + - ./user_data:/home/app/text-generation-webui/user_data deploy: resources: reservations: From fa960496d554ece24c06088607692fa7b874ff5b Mon Sep 17 00:00:00 2001 From: Jonas Date: Thu, 8 May 2025 17:30:27 +0200 Subject: [PATCH 055/164] Tools support for OpenAI compatible API (#6827) --- extensions/openai/completions.py | 73 +++++++++++++++++++++---- extensions/openai/typing.py | 47 +++++++++++++++- extensions/openai/utils.py | 94 ++++++++++++++++++++++++++++++++ modules/chat.py | 12 ++-- 4 files changed, 209 insertions(+), 17 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index a7d8b4e4..ed0bcc40 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -1,11 +1,14 @@ import copy import time +import json from collections import deque import tiktoken from extensions.openai.errors import InvalidRequestError -from extensions.openai.utils import debug_msg +from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall +from extensions.openai.typing import ToolDefinition +from pydantic import ValidationError from modules import shared from modules.chat import ( generate_chat_prompt, @@ -99,19 +102,24 @@ def convert_history(history): user_input = content user_input_last = True if current_message: - chat_dialogue.append([current_message, '']) + chat_dialogue.append([current_message, '', '']) current_message = "" current_message = content elif role == "assistant": + if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "": + continue # skip tool calls 
current_reply = content user_input_last = False if current_message: - chat_dialogue.append([current_message, current_reply]) + chat_dialogue.append([current_message, current_reply, '']) current_message = "" current_reply = "" else: - chat_dialogue.append(['', current_reply]) + chat_dialogue.append(['', current_reply, '']) + elif role == "tool": + user_input_last = False + chat_dialogue.append(['', '', content]) elif role == "system": system_message += f"\n{content}" if system_message else content @@ -131,6 +139,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p if 'messages' not in body: raise InvalidRequestError(message="messages is required", param='messages') + tools = None + if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0: + tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails + messages = body['messages'] for m in messages: if 'role' not in m: @@ -188,6 +200,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p 'custom_system_message': custom_system_message, 'chat_template_str': chat_template_str, 'chat-instruct_command': chat_instruct_command, + 'tools': tools, 'history': history, 'stream': stream }) @@ -200,7 +213,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p requested_model = generate_params.pop('model') logprob_proc = generate_params.pop('logprob_proc', None) - def chat_streaming_chunk(content): + def chat_streaming_chunk(content, chunk_tool_calls=None): # begin streaming chunk = { "id": cmpl_id, @@ -210,7 +223,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p resp_list: [{ "index": 0, "finish_reason": None, - "delta": {'role': 'assistant', 'content': content}, + "delta": {'role': 'assistant', 'content': content, 'tool_calls': chunk_tool_calls}, }], } @@ -219,6 +232,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]} # else: # chunk[resp_list][0]["logprobs"] = None + return chunk # generate reply ####################################### @@ -227,8 +241,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p yield {'prompt': prompt} return - debug_msg({'prompt': prompt, 'generate_params': generate_params}) - if stream: yield chat_streaming_chunk('') @@ -238,8 +250,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p answer = '' seen_content = '' + tool_calls = [] + end_last_tool_call = 0 + supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None + for a in generator: answer = a['internal'][-1][1] + + if supported_tools is not None: + tool_call = parseToolCall(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else [] + if len(tool_call) > 0: + for tc in tool_call: + tc["id"] = getToolCallId() + tc["index"] = str(len(tool_calls)) + tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"]) + tool_calls.append(tc) + end_last_tool_call = len(answer) + if stream: len_seen = len(seen_content) new_content = answer[len_seen:] @@ -247,18 +274,25 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet. 
continue - seen_content = answer chunk = chat_streaming_chunk(new_content) + + seen_content = answer yield chunk + # stop generation if tool_calls were generated previously + if len(tool_calls) > 0: + break + token_count = len(encode(prompt)[0]) completion_token_count = len(encode(answer)[0]) stop_reason = "stop" + if len(tool_calls) > 0: + stop_reason = "tool_calls" if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']: stop_reason = "length" if stream: - chunk = chat_streaming_chunk('') + chunk = chat_streaming_chunk('', tool_calls) chunk[resp_list][0]['finish_reason'] = stop_reason chunk['usage'] = { "prompt_tokens": token_count, @@ -276,7 +310,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p resp_list: [{ "index": 0, "finish_reason": stop_reason, - "message": {"role": "assistant", "content": answer} + "message": {"role": "assistant", "content": answer}, + "tool_calls": tool_calls }], "usage": { "prompt_tokens": token_count, @@ -465,3 +500,19 @@ def completions(body: dict, is_legacy: bool = False) -> dict: def stream_completions(body: dict, is_legacy: bool = False): for resp in completions_common(body, is_legacy, stream=True): yield resp + + +def validateTools(tools: list[dict]): + # Validate each tool definition in the JSON array + valid_tools = None + for idx in range(len(tools)): + tool = tools[idx] + try: + tool_definition = ToolDefinition(**tool) + if valid_tools is None: + valid_tools = [] + valid_tools.append(tool) + except ValidationError: + raise InvalidRequestError(message=f"Invalid tool specification at index {idx}.", param='tools') + + return valid_tools diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index b1979cbc..b28ebb4e 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -1,8 +1,8 @@ import json import time -from typing import Dict, List +from typing import Dict, List, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, validator class GenerationOptions(BaseModel): @@ -54,6 +54,48 @@ class GenerationOptions(BaseModel): grammar_string: str = "" +class ToolDefinition(BaseModel): + function: 'ToolFunction' + type: str + + +class ToolFunction(BaseModel): + description: str + name: str + parameters: 'ToolParameters' + + +class ToolParameters(BaseModel): + properties: Optional[Dict[str, 'ToolProperty']] = None + required: Optional[list[str]] = None + type: str + description: Optional[str] = None + + +class ToolProperty(BaseModel): + description: Optional[str] = None + type: Optional[str] = None # we are faced with definitions like anyOf, e.g. 
{'type': 'function', 'function': {'name': 'git_create_branch', 'description': 'Creates a new branch from an optional base branch', 'parameters': {'type': 'object', 'properties': {'repo_path': {'title': 'Repo Path', 'type': 'string'}, 'branch_name': {'title': 'Branch Name', 'type': 'string'}, 'base_branch': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'Base Branch'}}, 'required': ['repo_path', 'branch_name'], 'title': 'GitCreateBranch'}}} + + +class FunctionCall(BaseModel): + name: str + arguments: Optional[str] = None + parameters: Optional[str] = None + + @validator('arguments', allow_reuse=True) + def checkPropertyArgsOrParams(cls, v, values, **kwargs): + if not v and not values.get('parameters'): + raise ValueError("At least one of 'arguments' or 'parameters' must be provided as property in FunctionCall type") + return v + + +class ToolCall(BaseModel): + id: str + index: int + type: str + function: FunctionCall + + class CompletionRequestParams(BaseModel): model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.") prompt: str | List[str] @@ -92,6 +134,7 @@ class ChatCompletionRequestParams(BaseModel): frequency_penalty: float | None = 0 function_call: str | dict | None = Field(default=None, description="Unused parameter.") functions: List[dict] | None = Field(default=None, description="Unused parameter.") + tools: List[dict] | None = Field(default=None, description="Tools signatures passed via MCP.") logit_bias: dict | None = None max_tokens: int | None = None n: int | None = Field(default=1, description="Unused parameter.") diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py index 2b414769..8cb856ff 100644 --- a/extensions/openai/utils.py +++ b/extensions/openai/utils.py @@ -1,6 +1,9 @@ import base64 import os import time +import json +import random +import re import traceback from typing import Callable, Optional @@ -52,3 +55,94 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star time.sleep(3) raise Exception('Could not start cloudflared.') + + +def getToolCallId() -> str: + letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789" + b = [random.choice(letter_bytes) for _ in range(8)] + return "call_" + "".join(b).lower() + + +def checkAndSanitizeToolCallCandidate(candidate_dict: dict, tool_names: list[str]): + # check if property 'function' exists and is a dictionary, otherwise adapt dict + if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str): + candidate_dict = {"type": "function", "function": candidate_dict} + if 'function' in candidate_dict and isinstance(candidate_dict['function'], str): + candidate_dict['name'] = candidate_dict['function'] + del candidate_dict['function'] + candidate_dict = {"type": "function", "function": candidate_dict} + if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict): + # check if 'name' exists within 'function' and is part of known tools + if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names: + candidate_dict["type"] = "function" # ensure required property 'type' exists and has the right value + # map property 'parameters' used by some older models to 'arguments' + if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]: + candidate_dict["function"]["arguments"] = candidate_dict["function"]["parameters"] + del 
candidate_dict["function"]["parameters"] + return candidate_dict + return None + + +def parseToolCall(answer: str, tool_names: list[str]): + matches = [] + + # abort on very short answers to save computation cycles + if len(answer) < 10: + return matches + + # Define the regex pattern to find the JSON content wrapped in , , , and other tags observed from various models + patterns = [ r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)" ] + + for pattern in patterns: + for match in re.finditer(pattern, answer, re.DOTALL): + # print(match.group(2)) + if match.group(2) is None: + continue + # remove backtick wraps if present + candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip()) + candidate = re.sub(r"```$", "", candidate.strip()) + # unwrap inner tags + candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL) + # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually + if re.search(r"\}\s*\n\s*\{", candidate) is not None: + candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate) + if not candidate.strip().startswith("["): + candidate = "[" + candidate + "]" + + candidates = [] + try: + # parse the candidate JSON into a dictionary + candidates = json.loads(candidate) + if not isinstance(candidates, list): + candidates = [candidates] + except json.JSONDecodeError: + # Ignore invalid JSON silently + continue + + for candidate_dict in candidates: + checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names) + if checked_candidate is not None: + matches.append(checked_candidate) + + # last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags + if len(matches) == 0: + try: + candidate = answer + # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually + if re.search(r"\}\s*\n\s*\{", candidate) is not None: + candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate) + if not candidate.strip().startswith("["): + candidate = "[" + candidate + "]" + # parse the candidate JSON into a dictionary + candidates = json.loads(candidate) + if not isinstance(candidates, list): + candidates = [candidates] + for candidate_dict in candidates: + checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names) + if checked_candidate is not None: + matches.append(checked_candidate) + except json.JSONDecodeError: + # Ignore invalid JSON silently + pass + + return matches diff --git a/modules/chat.py b/modules/chat.py index feac6bdd..b524b1b9 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -145,7 +145,7 @@ def generate_chat_prompt(user_input, state, **kwargs): instruct_renderer = partial( instruction_template.render, builtin_tools=None, - tools=None, + tools=state['tools'] if 'tools' in state else None, tools_in_user_message=False, add_generation_prompt=False ) @@ -171,9 +171,13 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "system", "content": context}) insert_pos = len(messages) - for user_msg, assistant_msg in reversed(history): - user_msg = user_msg.strip() - assistant_msg = assistant_msg.strip() + for entry in reversed(history): + user_msg = entry[0].strip() + assistant_msg = entry[1].strip() + tool_msg = entry[2].strip() if len(entry) > 2 else '' + + if tool_msg: + messages.insert(insert_pos, {"role": "tool", "content": tool_msg}) if assistant_msg: messages.insert(insert_pos, {"role": 
"assistant", "content": assistant_msg}) From a1b3307b6636b13373e8f399690cb3b782854d2c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 08:58:14 -0700 Subject: [PATCH 056/164] Bump llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index ac89f45b..3a059c91 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 431cd740..ebc33216 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 0c581f86..8ec6898f 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 6abdb1a4..afc869c8 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 682c6a47..8d7d29b7 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index ac277d61..d69aae18 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index cc412d33..540c9ac8 100644 --- 
a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 1e185079..3bb5a74a 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 1240d335..95319d75 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 6b165b7c..4b49b4e1 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 1b2b5cf2..a6ebda30 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 2793d743..bb5ba8ad 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 6d7316a6..3d17dd49 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index e56eba08..ff9fa04c 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index a7f8c703..e17f8ce7 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 5b427fd2..dd01b3a8 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 1c7209a725c8811f2d4d2325007b2e871c5af020 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 09:46:43 -0700 Subject: [PATCH 057/164] Save the chat history periodically during streaming --- modules/chat.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index b524b1b9..403d05e1 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -5,6 +5,7 @@ import html import json import pprint import re +import time from datetime import datetime from functools import partial from pathlib import Path @@ -485,10 +486,16 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): send_dummy_reply(state['start_with'], state) history = state['history'] + last_save_time = time.monotonic() + save_interval = 8 for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)): yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history - if i == 0: + + current_time = time.monotonic() + # Save on first iteration or if save_interval seconds have passed + if i == 0 or (current_time - last_save_time) >= save_interval: save_history(history, state['unique_id'], state['character_menu'], state['mode']) + last_save_time = current_time save_history(history, state['unique_id'], state['character_menu'], state['mode']) From 3bc2ec2b119c058446f9e9600213c75302a4ac4f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 10:34:09 -0700 Subject: [PATCH 058/164] Fix #6965 --- one_click.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/one_click.py b/one_click.py index cb16b813..482a6aa9 100644 --- a/one_click.py +++ b/one_click.py @@ -126,7 +126,7 @@ def check_env(): sys.exit(1) # Ensure this is a new environment and not the base environment - if os.environ["CONDA_DEFAULT_ENV"] == "base": + if os.environ.get("CONDA_DEFAULT_ENV", "") == "base": print("Create an environment for this project and activate it. 
Exiting...") sys.exit(1) From 9ea2a69210ab5658ba8daf6d7d604589de5fc741 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 10:41:25 -0700 Subject: [PATCH 059/164] llama.cpp: Add --no-webui to the llama-server command --- modules/llama_cpp_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index d8d2f61b..1046969a 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -261,6 +261,7 @@ class LlamaServer: "--gpu-layers", str(shared.args.gpu_layers), "--batch-size", str(shared.args.batch_size), "--port", str(self.port), + "--no-webui", ] if shared.args.flash_attn: From bf7e4a4597b6492b4c440d32a8afbda59d4ef035 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 16:12:07 -0700 Subject: [PATCH 060/164] Docs: Add a tool/function calling example (from https://github.com/oobabooga/text-generation-webui/pull/6827#issuecomment-2854716960) --- docs/12 - OpenAI API.md | 79 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 364c6b09..db9befed 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -257,6 +257,85 @@ headers = { in any of the examples above. +#### Tool/Function Calling Example + +You need to use a model with tools support. The prompt will be automatically formatted using the model's Jinja2 template. + +Request: + +``` +curl http://127.0.0.1:5000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "What time is it currently in New York City?" + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_time", + "description": "Get current time in a specific timezones", + "parameters": { + "type": "object", + "required": ["timezone"], + "properties": { + "timezone": { + "type": "string", + "description": "IANA timezone name (e.g., America/New_York, Europe/London). Use Europe/Berlin as local timezone if no timezone provided by the user." 
+ } + } + } + } + } + ] + }' +``` + +Sample response: + +``` +{ + "id": "chatcmpl-1746532051477984256", + "object": "chat.completion", + "created": 1746532051, + "model": "qwen2.5-coder-14b-instruct-q4_k_m.gguf", + "choices": [ + { + "index": 0, + "finish_reason": "tool_calls", + "message": { + "role": "assistant", + "content": "```xml\n\n{\n \"name\": \"get_current_time\",\n \"arguments\": {\n \"timezone\": \"America/New_York\"\n }\n}\n\n```" + }, + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_current_time", + "arguments": "{\"timezone\": \"America/New_York\"}" + }, + "id": "call_52ij07mh", + "index": "0" + } + ] + } + ], + "usage": { + "prompt_tokens": 224, + "completion_tokens": 38, + "total_tokens": 262 + } +} +``` + ### Environment variables The following environment variables can be used (they take precedence over everything else): From f8ef6e09af5d2e28cf67d1eea165591e156ac9d2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 18:19:04 -0700 Subject: [PATCH 061/164] UI: Make ctx-size a slider --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 33e152a0..d4d9b8b1 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -51,8 +51,8 @@ def create_ui(): with gr.Row(): with gr.Column(): shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536, 131072.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) From 512bc2d0e02bef2434370c2317bcf56e50f0513f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 23:43:55 -0700 Subject: [PATCH 062/164] UI: Update some labels --- modules/ui_model_menu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index d4d9b8b1..1e27255b 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -53,12 +53,12 @@ def create_ui(): shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. 
⚠️ Lower this value if you can\'t load the model.') shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') + shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) with gr.Column(): - shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') - shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') + shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') + shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') From 2bde625d5716355b30fdd414c9b104812b101ed1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 9 May 2025 00:19:25 -0700 Subject: [PATCH 063/164] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6cc84c50..0833f9b0 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. - Multiple sampling parameters and generation options for sophisticated text generation control. - Switch between different models easily in the UI without restarting, with fine control over settings. -- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). 
+- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - 100% offline and private, with zero telemetry, external resources, or remote update requests. - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. From 8984e95c671c262b1667805895d317a9ffe9cd0a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 9 May 2025 07:21:05 -0700 Subject: [PATCH 064/164] UI: More friendly message when no model is loaded --- modules/logits.py | 7 ++++--- modules/text_generation.py | 5 +++-- modules/utils.py | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/modules/logits.py b/modules/logits.py index 32aef7ae..56a20572 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -7,6 +7,7 @@ from modules import models, shared from modules.logging_colors import logger from modules.models import load_model from modules.text_generation import generate_reply +from modules.utils import check_model_loaded global_scores = None @@ -33,9 +34,9 @@ def get_next_logits(*args, **kwargs): def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False): - if shared.model is None: - logger.error("No model is loaded! Select one in the Model tab.") - return 'Error: No model is loaded1 Select one in the Model tab.', previous + model_is_loaded, error_message = check_model_loaded() + if not model_is_loaded: + return error_message, previous # llama.cpp case if shared.model.__class__.__name__ == 'LlamaServer': diff --git a/modules/text_generation.py b/modules/text_generation.py index 7e48a2f6..c0c0350d 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -14,6 +14,7 @@ from modules.callbacks import Iteratorize from modules.extensions import apply_extensions from modules.html_generator import generate_basic_html from modules.logging_colors import logger +from modules.utils import check_model_loaded def generate_reply(*args, **kwargs): @@ -34,8 +35,8 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap # Find the appropriate generation function generate_func = apply_extensions('custom_generate_reply') if generate_func is None: - if shared.model_name == 'None' or shared.model is None: - logger.error("No model is loaded! Select one in the Model tab.") + model_is_loaded, error_message = check_model_loaded() + if not model_is_loaded: yield '' return diff --git a/modules/utils.py b/modules/utils.py index 77324139..0e390d08 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -72,6 +72,20 @@ def natural_keys(text): return [atoi(c) for c in re.split(r'(\d+)', text)] +def check_model_loaded(): + if shared.model_name == 'None' or shared.model is None: + if len(get_available_models()) <= 1: + error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it" + logger.error(error_msg) + return False, error_msg + else: + error_msg = "No model is loaded. Please select one in the Model tab." 
+ logger.error(error_msg) + return False, error_msg + + return True, None + + def get_available_models(): # Get all GGUF files gguf_files = get_available_ggufs() From 4920981b140862f3b085f614b83269e6ac228605 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 9 May 2025 20:35:38 -0700 Subject: [PATCH 065/164] UI: Remove the typing cursor --- modules/chat.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 403d05e1..b83c4bfe 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -399,16 +399,13 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess # Extract the reply if state['mode'] in ['chat', 'chat-instruct']: - visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply + '▍') + visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply) else: - visible_reply = reply + '▍' + visible_reply = reply visible_reply = html.escape(visible_reply) if shared.stop_everything: - if output['visible'][-1][1].endswith('▍'): - output['visible'][-1][1] = output['visible'][-1][1][:-1] - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output return @@ -424,9 +421,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if is_stream: yield output - if output['visible'][-1][1].endswith('▍'): - output['visible'][-1][1] = output['visible'][-1][1][:-1] - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output From 47d47585095da3a76988eabe52765a332a668d55 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 10 May 2025 17:46:00 -0700 Subject: [PATCH 066/164] Fix #6970 --- modules/shared.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index f2698bd2..4e0a20db 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -128,9 +128,9 @@ group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') # Cache -group = parser.add_argument_group('Context and cache management') +group = parser.add_argument_group('Context and cache') group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') -group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') +group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g.
q4_q8).') # Speculative decoding group = parser.add_argument_group('Speculative decoding') @@ -159,10 +159,6 @@ group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='B group = parser.add_argument_group('TensorRT-LLM') group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') -# Cache -group = parser.add_argument_group('Cache') -group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') - # DeepSpeed group = parser.add_argument_group('DeepSpeed') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') From 006a866079d4a719f4405efdbbe18c03e106c541 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 10 May 2025 17:55:48 -0700 Subject: [PATCH 067/164] Fix API failing to cancel streams (attempt), closes #6966 --- extensions/openai/script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index a995da9d..66f38501 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -118,6 +118,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest): for resp in response: disconnected = await request.is_disconnected() if disconnected: + stop_everything_event() break yield {"data": json.dumps(resp)} @@ -141,6 +142,7 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion for resp in response: disconnected = await request.is_disconnected() if disconnected: + stop_everything_event() break yield {"data": json.dumps(resp)} From 0c5fa3728e8f0505692966f7a296e6561566c7bd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 10 May 2025 19:12:40 -0700 Subject: [PATCH 068/164] Revert "Fix API failing to cancel streams (attempt), closes #6966" This reverts commit 006a866079d4a719f4405efdbbe18c03e106c541. 
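As an aside on the cancellation problem the two commits above wrestle with: the reverted change tried to fire the global stop event as soon as the SSE client disconnects, so the backend stops generating for an abandoned request. Below is a rough, hedged sketch of that idea — `stream_tokens()` and `stop_generation()` are placeholder stand-ins for the project's real generator and `stop_everything_event()`, not actual APIs from this repository.

```python
# Hypothetical sketch: stop generation when the SSE client disconnects.
# stream_tokens() and stop_generation() are placeholders, not project APIs.
import json

from fastapi import FastAPI, Request
from sse_starlette import EventSourceResponse

app = FastAPI()


def stream_tokens():
    """Placeholder for a blocking token generator."""
    for token in ["Hello", ",", " world"]:
        yield {"text": token}


def stop_generation():
    """Placeholder for something like stop_everything_event()."""
    pass


@app.post("/v1/completions")
async def completions(request: Request):
    async def generator():
        for chunk in stream_tokens():
            # Stop the backend instead of generating for a client that left
            if await request.is_disconnected():
                stop_generation()
                break
            yield {"data": json.dumps(chunk)}

    return EventSourceResponse(generator())
```

Note that a plain `for` loop over a blocking generator only yields control between chunks, so the disconnect check can lag behind generation; a later commit in this series addresses that by iterating the generator in a thread pool.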
--- extensions/openai/script.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 66f38501..a995da9d 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -118,7 +118,6 @@ async def openai_completions(request: Request, request_data: CompletionRequest): for resp in response: disconnected = await request.is_disconnected() if disconnected: - stop_everything_event() break yield {"data": json.dumps(resp)} @@ -142,7 +141,6 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion for resp in response: disconnected = await request.is_disconnected() if disconnected: - stop_everything_event() break yield {"data": json.dumps(resp)} From e7ac06c1694024594450437f3b899e32ab2ce6e4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 10 May 2025 19:20:04 -0700 Subject: [PATCH 069/164] New attempt --- modules/llama_cpp_server.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 1046969a..615f29ad 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -146,8 +146,9 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - # Make a direct request with streaming enabled using a context manager - with self.session.post(url, json=payload, stream=True) as response: + # Make a request with streaming enabled + response = self.session.post(url, json=payload, stream=True) + try: response.raise_for_status() # Raise an exception for HTTP errors full_text = "" @@ -185,6 +186,9 @@ class LlamaServer: print(f"Problematic line: {line}") continue + finally: + response.close() + def generate(self, prompt, state): output = "" for output in self.generate_with_streaming(prompt, state): From 62c774bf24d35a1ebdcdb9927f8a6c6ae3949c82 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 06:42:25 -0700 Subject: [PATCH 070/164] Revert "New attempt" This reverts commit e7ac06c1694024594450437f3b899e32ab2ce6e4. 
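For contrast with the two reverts above, the approach that lands in the next commit moves the blocking iteration off the event loop so disconnects can actually be observed between chunks. A simplified sketch of that pattern follows — `blocking_stream()` is an assumed stand-in for the real completion generator, not the project's function name.

```python
# Simplified sketch: iterate a blocking generator in a worker thread so the
# event loop stays free to await request.is_disconnected() between chunks.
import asyncio
import json
import time

from fastapi import FastAPI, Request
from sse_starlette import EventSourceResponse
from starlette.concurrency import iterate_in_threadpool

app = FastAPI()


def blocking_stream():
    """Stand-in for a slow, blocking token generator."""
    for i in range(3):
        time.sleep(0.5)  # simulates generation latency
        yield {"token": i}


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    async def generator():
        # iterate_in_threadpool wraps the sync iterator as an async iterator
        async for chunk in iterate_in_threadpool(blocking_stream()):
            if await request.is_disconnected():
                break
            yield {"data": json.dumps(chunk)}

    return EventSourceResponse(generator())


@app.post("/v1/chat/completions/blocking")
async def chat_completions_blocking():
    # Non-streaming path: run the whole blocking call in a worker thread
    chunks = await asyncio.to_thread(lambda: list(blocking_stream()))
    return {"choices": chunks}
```

The same two building blocks — `iterate_in_threadpool` for the streaming path and `asyncio.to_thread` for the non-streaming one — are what the following patch applies to the OpenAI-compatible endpoints.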
--- modules/llama_cpp_server.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 615f29ad..1046969a 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -146,9 +146,8 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - # Make a request with streaming enabled - response = self.session.post(url, json=payload, stream=True) - try: + # Make a direct request with streaming enabled using a context manager + with self.session.post(url, json=payload, stream=True) as response: response.raise_for_status() # Raise an exception for HTTP errors full_text = "" @@ -186,9 +185,6 @@ class LlamaServer: print(f"Problematic line: {line}") continue - finally: - response.close() - def generate(self, prompt, state): output = "" for output in self.generate_with_streaming(prompt, state): From c375b6941395454bef52d9ac0e102c0de3f4d3ee Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 11:23:33 -0700 Subject: [PATCH 071/164] API: Fix llama.cpp generating after disconnect, improve disconnect detection, fix deadlock on simultaneous requests --- extensions/openai/script.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index a995da9d..2b4f274f 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -14,6 +14,7 @@ from fastapi.requests import Request from fastapi.responses import JSONResponse from pydub import AudioSegment from sse_starlette import EventSourceResponse +from starlette.concurrency import iterate_in_threadpool import extensions.openai.completions as OAIcompletions import extensions.openai.images as OAIimages @@ -115,7 +116,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest): async def generator(): async with streaming_semaphore: response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy) - for resp in response: + async for resp in iterate_in_threadpool(response): disconnected = await request.is_disconnected() if disconnected: break @@ -125,7 +126,12 @@ async def openai_completions(request: Request, request_data: CompletionRequest): return EventSourceResponse(generator()) # SSE streaming else: - response = OAIcompletions.completions(to_dict(request_data), is_legacy=is_legacy) + response = await asyncio.to_thread( + OAIcompletions.completions, + to_dict(request_data), + is_legacy=is_legacy + ) + return JSONResponse(response) @@ -138,7 +144,7 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion async def generator(): async with streaming_semaphore: response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy) - for resp in response: + async for resp in iterate_in_threadpool(response): disconnected = await request.is_disconnected() if disconnected: break @@ -148,7 +154,12 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion return EventSourceResponse(generator()) # SSE streaming else: - response = OAIcompletions.chat_completions(to_dict(request_data), is_legacy=is_legacy) + response = await asyncio.to_thread( + OAIcompletions.chat_completions, + to_dict(request_data), + is_legacy=is_legacy + ) + return JSONResponse(response) From 3fa1a899aea3ff2700a20a8bc2da17202d3065e5 Mon Sep 17 00:00:00 2001 From: oobabooga 
<112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 12:07:59 -0700 Subject: [PATCH 072/164] UI: Fix gpu-layers being ignored (closes #6973) --- modules/loaders.py | 2 +- modules/models_settings.py | 2 +- modules/ui.py | 2 +- modules/ui_model_menu.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index 4b76549b..583b65c2 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -5,7 +5,7 @@ import gradio as gr loaders_and_params = OrderedDict({ 'llama.cpp': [ - 'n_gpu_layers', + 'gpu_layers', 'threads', 'threads_batch', 'batch_size', diff --git a/modules/models_settings.py b/modules/models_settings.py index ae589bb3..4418e3fb 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -67,7 +67,7 @@ def get_model_metadata(model): elif k.endswith('rope.scaling.factor'): model_settings['compress_pos_emb'] = metadata[k] elif k.endswith('block_count'): - model_settings['n_gpu_layers'] = metadata[k] + 1 + model_settings['gpu_layers'] = metadata[k] + 1 if 'tokenizer.chat_template' in metadata: template = metadata['tokenizer.chat_template'] diff --git a/modules/ui.py b/modules/ui.py index b3d4bccf..eeb6ce92 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -105,7 +105,7 @@ def list_model_elements(): 'filter_by_loader', 'loader', 'cpu_memory', - 'n_gpu_layers', + 'gpu_layers', 'threads', 'threads_batch', 'batch_size', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 1e27255b..b63a127c 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -50,7 +50,7 @@ def create_ui(): gr.Markdown("## Main options") with gr.Row(): with gr.Column(): - shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') From 2826c60044a05f316510ef93546b5dbff59b3864 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 14:45:46 -0700 Subject: [PATCH 073/164] Use logger for "Output generated in ..." 
messages --- modules/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index c0c0350d..00b9275a 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -470,7 +470,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None, t1 = time.time() original_tokens = len(original_input_ids[0]) new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0) - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') + logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') return @@ -499,7 +499,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N t1 = time.time() original_tokens = len(encode(original_question)[0]) new_tokens = len(encode(original_question + reply)[0]) - original_tokens - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})') + logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})') return From 035cd3e2a906a6094d0f1f298df49c0152f1a2ee Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 20:09:22 -0700 Subject: [PATCH 074/164] UI: Hide the extension install menu in portable builds --- modules/ui_session.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/modules/ui_session.py b/modules/ui_session.py index 7cf9f6e6..a4eba667 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -23,11 +23,15 @@ def create_ui(): shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table') with gr.Column(): - extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu) - extension_status = gr.Markdown() + if not shared.args.portable: + extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. 
Make sure to inspect their source code before activating them.', interactive=not mu) + extension_status = gr.Markdown() + else: + pass shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light') - extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False) + if not shared.args.portable: + extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False) # Reset interface event shared.gradio['reset_interface'].click( From c4a715fd1e86e52e3350f8126847524b488a04e2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 20:14:09 -0700 Subject: [PATCH 075/164] UI: Move the LoRA menu under "Other options" --- modules/ui_model_menu.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index b63a127c..81ad1a53 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -26,25 +26,12 @@ def create_ui(): with gr.Row(): with gr.Column(): with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) - shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu) - shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) - shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) + shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) + shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu) + shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) + shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) - with gr.Column(): - if shared.args.portable: - pass - else: - with gr.Row(): - shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) - shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) - - with gr.Row(): - with gr.Column(): shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None) with gr.Blocks(): gr.Markdown("## Main options") @@ -113,6 +100,11 @@ def create_ui(): shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, 
info='Necessary to use CFG with this loader.') shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') + with gr.Row(): + shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) + shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) + with gr.Column(): with gr.Tab("Download"): From 5534d01da0913d315709a6adacd075639a6cffec Mon Sep 17 00:00:00 2001 From: oobabooga Date: Fri, 16 May 2025 00:07:37 -0300 Subject: [PATCH 076/164] Estimate the VRAM for GGUF models + autoset `gpu-layers` (#6980) --- css/main.css | 14 +++- modules/llama_cpp_server.py | 3 + modules/models.py | 1 - modules/models_settings.py | 151 +++++++++++++++++++++++++++++++++++- modules/ui_model_menu.py | 16 +++- server.py | 12 +++ 6 files changed, 193 insertions(+), 4 deletions(-) diff --git a/css/main.css b/css/main.css index 30089aca..0902b184 100644 --- a/css/main.css +++ b/css/main.css @@ -569,7 +569,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .dark .message-body :not(pre) > code { - background-color: rgb(255 255 255 / 12.5%); + background-color: rgb(255 255 255 / 10%); } #chat-input { @@ -1386,3 +1386,15 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { strong { font-weight: bold; } + +.min.svelte-1ybaih5 { + min-height: 0; +} + +#vram-info .value { + color: #008d00; +} + +.dark #vram-info .value { + color: #07ff07; +} diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 1046969a..3fc7a0cc 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -282,8 +282,10 @@ class LlamaServer: cmd.append("--no-kv-offload") if shared.args.row_split: cmd += ["--split-mode", "row"] + cache_type = "fp16" if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types: cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type] + cache_type = shared.args.cache_type if shared.args.compress_pos_emb != 1: cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)] if shared.args.rope_freq_base > 0: @@ -343,6 +345,7 @@ class LlamaServer: print(' '.join(str(item) for item in cmd[1:])) print() + logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}") # Start the server with pipes for output self.process = subprocess.Popen( cmd, diff --git a/modules/models.py b/modules/models.py index d0b0402a..9ecee803 100644 --- a/modules/models.py +++ b/modules/models.py @@ -71,7 +71,6 @@ def llama_cpp_server_loader(model_name): else: model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] - logger.info(f"llama.cpp weights detected: \"{model_file}\"") try: model = LlamaServer(model_file) return model, model diff --git a/modules/models_settings.py b/modules/models_settings.py index 4418e3fb..a8e17594 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -1,7 +1,11 @@ +import functools import json import re +import subprocess +from math import exp from pathlib import Path +import gradio as gr import yaml from modules import chat, loaders, metadata_gguf, 
shared, ui @@ -216,7 +220,17 @@ def apply_model_settings_to_state(model, state): for k in model_settings: if k in state: - state[k] = model_settings[k] + if k == 'gpu_layers': + available_vram = get_nvidia_free_vram() + n_layers = model_settings[k] + if available_vram > 0: + tolerance = 906 + while n_layers > 0 and estimate_vram(model, n_layers, state['ctx_size'], state['cache_type']) > available_vram - tolerance: + n_layers -= 1 + + state[k] = gr.update(value=n_layers, maximum=model_settings[k]) + else: + state[k] = model_settings[k] return state @@ -277,3 +291,138 @@ def save_instruction_template(model, template): yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.") else: yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.") + + +@functools.lru_cache(maxsize=None) +def get_gguf_metadata_cached(model_file): + return metadata_gguf.load_metadata(model_file) + + +def get_model_size_mb(model_file: Path) -> float: + filename = model_file.name + + # Check for multipart pattern + match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename) + + if match: + # It's a multipart file, find all matching parts + base_pattern = match.group(1) + part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf')) + total_size = sum(p.stat().st_size for p in part_files) + else: + # Single part + total_size = model_file.stat().st_size + + return total_size / (1024 ** 2) # Return size in MB + + +def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): + model_file = Path(f'{shared.args.model_dir}/{gguf_file}') + metadata = get_gguf_metadata_cached(model_file) + size_in_mb = get_model_size_mb(model_file) + + # Extract values from metadata + n_layers = None + n_kv_heads = None + embedding_dim = None + context_length = None + feed_forward_dim = None + + for key, value in metadata.items(): + if key.endswith('.block_count'): + n_layers = value + elif key.endswith('.attention.head_count_kv'): + n_kv_heads = value + elif key.endswith('.embedding_length'): + embedding_dim = value + elif key.endswith('.context_length'): + context_length = value + elif key.endswith('.feed_forward_length'): + feed_forward_dim = value + + if gpu_layers > n_layers: + gpu_layers = n_layers + + # Convert cache_type to numeric + if cache_type == 'q4_0': + cache_type = 4 + elif cache_type == 'q8_0': + cache_type = 8 + else: + cache_type = 16 + + # Derived features + size_per_layer = size_in_mb / max(n_layers, 1e-6) + context_per_layer = context_length / max(n_layers, 1e-6) + ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6) + kv_cache_factor = n_kv_heads * cache_type * ctx_size + + # Helper function for smaller + def smaller(x, y): + return 1 if x < y else 0 + + # Calculate VRAM using the model + # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/ + vram = ( + (size_per_layer - 21.19195204848197) + * exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845)) + + 0.0006621544775632052 * context_per_layer + + 3.34664386576376e-05 * kv_cache_factor + ) * (1.363306170123392 + gpu_layers) + 1255.163594536052 + + return vram + + +def get_nvidia_free_vram(): + """ + Calculates the total free VRAM across all NVIDIA GPUs by parsing nvidia-smi output. + + Returns: + int: The total free VRAM in MiB summed across all detected NVIDIA GPUs. + Returns -1 if nvidia-smi command fails (not found, error, etc.). + Returns 0 if nvidia-smi succeeds but no GPU memory info found. 
+ """ + try: + # Execute nvidia-smi command + result = subprocess.run( + ['nvidia-smi'], + capture_output=True, + text=True, + check=False + ) + + # Check if nvidia-smi returned an error + if result.returncode != 0: + return -1 + + # Parse the output for memory usage patterns + output = result.stdout + + # Find memory usage like "XXXXMiB / YYYYMiB" + # Captures used and total memory for each GPU + matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output) + + if not matches: + # No GPUs found in expected format + return 0 + + total_free_vram_mib = 0 + for used_mem_str, total_mem_str in matches: + try: + used_mib = int(used_mem_str) + total_mib = int(total_mem_str) + total_free_vram_mib += (total_mib - used_mib) + except ValueError: + # Skip malformed entries + pass + + return total_free_vram_mib + + except FileNotFoundError: + raise + # nvidia-smi not found (likely no NVIDIA drivers installed) + return -1 + except Exception: + raise + # Handle any other unexpected exceptions + return -1 diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 81ad1a53..2353f39c 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -11,6 +11,7 @@ from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model from modules.models_settings import ( apply_model_settings_to_state, + estimate_vram, get_model_metadata, save_instruction_template, save_model_settings, @@ -44,6 +45,7 @@ def create_ui(): shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) with gr.Column(): + shared.gradio['vram_info'] = gr.HTML(value=lambda: estimate_vram_wrapper(shared.args.model, shared.args.gpu_layers, shared.args.ctx_size, shared.args.cache_type)) shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) @@ -105,7 +107,6 @@ def create_ui(): ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) - with gr.Column(): with gr.Tab("Download"): shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. 
To download a single file, enter its name in the second box.", interactive=not mu) @@ -148,6 +149,11 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) + shared.gradio['model_menu'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) + shared.gradio['gpu_layers'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) + shared.gradio['ctx_size'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) + shared.gradio['cache_type'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) + if not shared.args.portable: shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) @@ -275,6 +281,14 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur yield traceback.format_exc().replace('\n', '\n\n') +def estimate_vram_wrapper(model, gpu_layers, ctx_size, cache_type): + if model in ["None", None]: + return "
Estimated VRAM to load the model:"
+
+    result = estimate_vram(model, gpu_layers, ctx_size, cache_type)
+    return f"
Estimated VRAM to load the model: {result:.0f} MiB" + + def update_truncation_length(current_length, state): if 'loader' in state: if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp': diff --git a/server.py b/server.py index b0b9e633..c35d65a8 100644 --- a/server.py +++ b/server.py @@ -49,8 +49,10 @@ from modules.extensions import apply_extensions from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model_if_idle from modules.models_settings import ( + estimate_vram, get_fallback_settings, get_model_metadata, + get_nvidia_free_vram, update_model_parameters ) from modules.shared import do_cmd_flags_warnings @@ -248,6 +250,16 @@ if __name__ == "__main__": model_settings = get_model_metadata(model_name) update_model_parameters(model_settings, initial=True) # hijack the command-line arguments + if 'gpu_layers' not in shared.provided_arguments: + available_vram = get_nvidia_free_vram() + if available_vram > 0: + n_layers = model_settings['gpu_layers'] + tolerance = 906 + while n_layers > 0 and estimate_vram(model_name, n_layers, shared.args.ctx_size, shared.args.cache_type) > available_vram - tolerance: + n_layers -= 1 + + shared.args.gpu_layers = n_layers + # Load the model shared.model, shared.tokenizer = load_model(model_name) if shared.args.lora: From 041248cc9f321aa6ff2e706083cdb776e3bf8d21 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 15 May 2025 20:10:02 -0700 Subject: [PATCH 077/164] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 3a059c91..45bb5c85 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and 
python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index ebc33216..4e011989 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 8ec6898f..a3bd1350 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index afc869c8..a52f2d64 100644 --- 
a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 8d7d29b7..929b1d86 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 
d69aae18..bd7c4a4f 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 540c9ac8..b5aa1cf7 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 3bb5a74a..bc320c27 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
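For reference, the VRAM estimator introduced in PATCH 076 above can be exercised directly from a Python shell. This is only an illustrative sketch, not part of the patch series: it assumes it is run from the web UI's repository root so the modules package imports resolve, and "model.gguf" is a placeholder for a real GGUF file under the configured models directory.

# Hedged usage sketch for the estimate_vram() helper added in PATCH 076.
# Assumes a repository checkout on the path; "model.gguf" is a placeholder file name.
from modules.models_settings import estimate_vram, get_nvidia_free_vram

free_mib = get_nvidia_free_vram()  # -1 if nvidia-smi is unavailable, 0 if no GPUs were parsed
for layers in (8, 16, 32):
    est = estimate_vram("model.gguf", layers, ctx_size=8192, cache_type="fp16")
    print(f"gpu_layers={layers}: ~{est:.0f} MiB estimated (free: {free_mib} MiB)")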
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 95319d75..79959398 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 4b49b4e1..ca16e4c7 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index a6ebda30..18e1c506 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_13_0_arm64.whl; 
platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index bb5ba8ad..693f4712 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 3d17dd49..8635d11e 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index ff9fa04c..e844596e 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index e17f8ce7..9b7435d1 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index dd01b3a8..513b7a15 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 8cb73b78e14154040ac4a2f7dd33dc7d46121108 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 15 May 2025 20:10:34 -0700 Subject: [PATCH 078/164] Update ExLlamaV3 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 45bb5c85..af5f7d8a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == 
"3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index a52f2d64..363365bf 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -31,5 +31,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 929b1d86..2843fed2 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,5 +32,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" 
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index bc320c27..89947cbe 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" From fd612979330ee0009ccbb14ac5bff894b675bb82 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 15 May 2025 21:19:19 -0700 Subject: [PATCH 079/164] Lint --- extensions/openai/completions.py | 8 ++++---- extensions/openai/utils.py | 6 +++--- extensions/superboogav2/chromadb.py | 3 ++- modules/tensorrt_llm.py | 6 +++--- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index ed0bcc40..5181b18b 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -1,14 +1,14 @@ import copy -import time import json +import time from collections import deque import tiktoken +from pydantic import ValidationError from extensions.openai.errors import InvalidRequestError -from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall from extensions.openai.typing import ToolDefinition -from pydantic import ValidationError +from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall from modules import shared from modules.chat import ( generate_chat_prompt, @@ -141,7 +141,7 @@ def 
chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p tools = None if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0: - tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails + tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails messages = body['messages'] for m in messages: diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py index 8cb856ff..9a1de2e7 100644 --- a/extensions/openai/utils.py +++ b/extensions/openai/utils.py @@ -1,9 +1,9 @@ import base64 -import os -import time import json +import os import random import re +import time import traceback from typing import Callable, Optional @@ -91,7 +91,7 @@ def parseToolCall(answer: str, tool_names: list[str]): return matches # Define the regex pattern to find the JSON content wrapped in , , , and other tags observed from various models - patterns = [ r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)" ] + patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)"] for pattern in patterns: for match in re.finditer(pattern, answer, re.DOTALL): diff --git a/extensions/superboogav2/chromadb.py b/extensions/superboogav2/chromadb.py index f4f77821..9344e25c 100644 --- a/extensions/superboogav2/chromadb.py +++ b/extensions/superboogav2/chromadb.py @@ -1,10 +1,11 @@ import math import random import threading -import torch + import chromadb import numpy as np import posthog +import torch from chromadb.config import Settings from chromadb.utils import embedding_functions diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py index 73178c39..0527d493 100644 --- a/modules/tensorrt_llm.py +++ b/modules/tensorrt_llm.py @@ -1,15 +1,15 @@ from pathlib import Path -import torch - import tensorrt_llm +import torch +from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp + from modules import shared from modules.logging_colors import logger from modules.text_generation import ( get_max_prompt_length, get_reply_from_output_ids ) -from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp class TensorRTLLMModel: From cbf4daf1c8d149206da80892ced0220cf858ebb7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 15 May 2025 21:21:54 -0700 Subject: [PATCH 080/164] Hide the LoRA menu in portable mode --- modules/ui_model_menu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 2353f39c..a1911124 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -102,10 +102,11 @@ def create_ui(): shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') - with gr.Row(): - shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) - shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) + if 
not shared.args.portable: + with gr.Row(): + shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) + shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) with gr.Column(): with gr.Tab("Download"): From 93e1850a2c1eef8fe914bd020dde3e94d6b54f6c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 15 May 2025 21:42:15 -0700 Subject: [PATCH 081/164] Only show the VRAM info for llama.cpp --- modules/loaders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/loaders.py b/modules/loaders.py index 583b65c2..79a7a4a3 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -28,6 +28,7 @@ loaders_and_params = OrderedDict({ 'device_draft', 'ctx_size_draft', 'speculative_decoding_accordion', + 'vram_info', ], 'Transformers': [ 'gpu_split', From 4925c307cfc97c1ca549b71db6f1aaaf82fd4fb2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 09:07:38 -0700 Subject: [PATCH 082/164] Auto-adjust GPU layers on context size and cache type changes + many fixes --- modules/models_settings.py | 78 +++++++++++++++++++++++++++++++------- modules/ui_model_menu.py | 46 ++++++++++++++-------- server.py | 23 ++++++----- 3 files changed, 109 insertions(+), 38 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index a8e17594..6ea6660c 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -213,24 +213,26 @@ def apply_model_settings_to_state(model, state): model_settings = get_model_metadata(model) if 'loader' in model_settings: loader = model_settings.pop('loader') - - # If the user is using an alternative loader for the same model type, let them keep using it if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']): state['loader'] = loader for k in model_settings: - if k in state: - if k == 'gpu_layers': - available_vram = get_nvidia_free_vram() - n_layers = model_settings[k] - if available_vram > 0: - tolerance = 906 - while n_layers > 0 and estimate_vram(model, n_layers, state['ctx_size'], state['cache_type']) > available_vram - tolerance: - n_layers -= 1 + if k in state and k != 'gpu_layers': # Skip gpu_layers, handle separately + state[k] = model_settings[k] - state[k] = gr.update(value=n_layers, maximum=model_settings[k]) - else: - state[k] = model_settings[k] + # Handle GPU layers and VRAM update for llama.cpp + if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings: + vram_info, gpu_layers_update = update_gpu_layers_and_vram( + state['loader'], + model, + model_settings['gpu_layers'], + state['ctx_size'], + state['cache_type'], + auto_adjust=True + ) + + state['gpu_layers'] = gpu_layers_update + state['vram_info'] = vram_info return state @@ -426,3 +428,53 @@ def get_nvidia_free_vram(): raise # Handle any other unexpected exceptions return -1 + + +def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True): + """ + Unified function to handle GPU layers and VRAM updates. + + Args: + for_ui: If True, returns Gradio updates. If False, returns raw values. 
+ + Returns: + - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update + - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage + """ + if loader != 'llama.cpp' or model in ["None", None]: + vram_info = "
Estimated VRAM to load the model:" + if for_ui: + return (vram_info, gr.update()) if auto_adjust else vram_info + else: + return (0, gpu_layers) if auto_adjust else 0 + + current_layers = gpu_layers + max_layers = gpu_layers + + if auto_adjust: + # Get max layers from model metadata + model_settings = get_model_metadata(model) + max_layers = model_settings.get('gpu_layers', gpu_layers) + + # Auto-adjust based on available VRAM + available_vram = get_nvidia_free_vram() + if available_vram > 0: + tolerance = 906 + current_layers = max_layers + while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: + current_layers -= 1 + + # Calculate VRAM with current layers + vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type) + + if for_ui: + vram_info = f"
Estimated VRAM to load the model: {vram_usage:.0f} MiB" + if auto_adjust: + return vram_info, gr.update(value=current_layers, maximum=max_layers) + else: + return vram_info + else: + if auto_adjust: + return vram_usage, current_layers + else: + return vram_usage diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index a1911124..b6febb50 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -11,10 +11,10 @@ from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model from modules.models_settings import ( apply_model_settings_to_state, - estimate_vram, get_model_metadata, save_instruction_template, save_model_settings, + update_gpu_layers_and_vram, update_model_parameters ) from modules.utils import gradio @@ -45,7 +45,7 @@ def create_ui(): shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) with gr.Column(): - shared.gradio['vram_info'] = gr.HTML(value=lambda: estimate_vram_wrapper(shared.args.model, shared.args.gpu_layers, shared.args.ctx_size, shared.args.cache_type)) + shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) @@ -150,10 +150,18 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) - shared.gradio['model_menu'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) - shared.gradio['gpu_layers'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) - shared.gradio['ctx_size'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) - shared.gradio['cache_type'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) + # For ctx_size and cache_type - auto-adjust GPU layers + for param in ['ctx_size', 'cache_type']: + shared.gradio[param].change( + partial(update_gpu_layers_and_vram, auto_adjust=True), + gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), + gradio('vram_info', 'gpu_layers'), show_progress=False) + + # For manual gpu_layers changes - only update VRAM + shared.gradio['gpu_layers'].change( + partial(update_gpu_layers_and_vram, auto_adjust=False), + gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), + gradio('vram_info'), show_progress=False) if not shared.args.portable: shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) @@ -282,14 +290,6 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur yield traceback.format_exc().replace('\n', '\n\n') -def estimate_vram_wrapper(model, gpu_layers, ctx_size, cache_type): - if model in ["None", None]: - return "
Estimated VRAM to load the model:" - - result = estimate_vram(model, gpu_layers, ctx_size, cache_type) - return f"
Estimated VRAM to load the model: {result:.0f} MiB" - - def update_truncation_length(current_length, state): if 'loader' in state: if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp': @@ -298,10 +298,26 @@ def update_truncation_length(current_length, state): return current_length +def get_initial_vram_info(): + if shared.model_name != 'None' and shared.args.loader == 'llama.cpp': + return update_gpu_layers_and_vram( + shared.args.loader, + shared.model_name, + shared.args.gpu_layers, + shared.args.ctx_size, + shared.args.cache_type, + auto_adjust=False, + for_ui=True + ) + + return "
Estimated VRAM to load the model:" + + def handle_load_model_event_initial(model, state): state = apply_model_settings_to_state(model, state) output = ui.apply_interface_values(state) - update_model_parameters(state) + update_model_parameters(state) # This updates the command-line flags + return output + [state] diff --git a/server.py b/server.py index c35d65a8..c22ed1f1 100644 --- a/server.py +++ b/server.py @@ -49,10 +49,9 @@ from modules.extensions import apply_extensions from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model_if_idle from modules.models_settings import ( - estimate_vram, get_fallback_settings, get_model_metadata, - get_nvidia_free_vram, + update_gpu_layers_and_vram, update_model_parameters ) from modules.shared import do_cmd_flags_warnings @@ -250,15 +249,19 @@ if __name__ == "__main__": model_settings = get_model_metadata(model_name) update_model_parameters(model_settings, initial=True) # hijack the command-line arguments - if 'gpu_layers' not in shared.provided_arguments: - available_vram = get_nvidia_free_vram() - if available_vram > 0: - n_layers = model_settings['gpu_layers'] - tolerance = 906 - while n_layers > 0 and estimate_vram(model_name, n_layers, shared.args.ctx_size, shared.args.cache_type) > available_vram - tolerance: - n_layers -= 1 + # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model + if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings: + vram_usage, adjusted_layers = update_gpu_layers_and_vram( + shared.args.loader, + model_name, + model_settings['gpu_layers'], + shared.args.ctx_size, + shared.args.cache_type, + auto_adjust=True, + for_ui=False + ) - shared.args.gpu_layers = n_layers + shared.args.gpu_layers = adjusted_layers # Load the model shared.model, shared.tokenizer = load_model(model_name) From ee7b3028acaa38399272e78bb05272a420e72f05 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 09:12:36 -0700 Subject: [PATCH 083/164] Always cache GGUF metadata calls --- modules/models_settings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 6ea6660c..8ecd2267 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -58,7 +58,7 @@ def get_model_metadata(model): else: model_file = list(path.glob('*.gguf'))[0] - metadata = metadata_gguf.load_metadata(model_file) + metadata = load_gguf_metadata_with_cache(model_file) for k in metadata: if k.endswith('context_length'): @@ -295,8 +295,8 @@ def save_instruction_template(model, template): yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.") -@functools.lru_cache(maxsize=None) -def get_gguf_metadata_cached(model_file): +@functools.lru_cache(maxsize=1) +def load_gguf_metadata_with_cache(model_file): return metadata_gguf.load_metadata(model_file) @@ -320,7 +320,7 @@ def get_model_size_mb(model_file: Path) -> float: def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): model_file = Path(f'{shared.args.model_dir}/{gguf_file}') - metadata = get_gguf_metadata_cached(model_file) + metadata = load_gguf_metadata_with_cache(model_file) size_in_mb = get_model_size_mb(model_file) # Extract values from metadata From 9ec9b1bf837a995af8f203c8d05897510ab77c3d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 09:56:23 -0700 Subject: 
[PATCH 084/164] Auto-adjust GPU layers after model unload to utilize freed VRAM --- modules/ui_model_menu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index b6febb50..39c57bf3 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -145,7 +145,9 @@ def create_event_handlers(): partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) - shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False) + shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then( + partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False) + shared.gradio['save_model_settings'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) From 253e85a519219385668aabeabc82633c8e734ff9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 10:02:30 -0700 Subject: [PATCH 085/164] Only compute VRAM/GPU layers for llama.cpp models --- modules/models_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 8ecd2267..0eb179d7 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -441,7 +441,7 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage """ - if loader != 'llama.cpp' or model in ["None", None]: + if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"): vram_info = "
Estimated VRAM to load the model:" if for_ui: return (vram_info, gr.update()) if auto_adjust else vram_info From 38c50087feb11af41fed7a944ab0d7ef45a3bc44 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 11:55:06 -0700 Subject: [PATCH 086/164] Prevent a crash on systems without an NVIDIA GPU --- modules/models_settings.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 0eb179d7..3fdf3c84 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -421,11 +421,9 @@ def get_nvidia_free_vram(): return total_free_vram_mib except FileNotFoundError: - raise # nvidia-smi not found (likely no NVIDIA drivers installed) return -1 except Exception: - raise # Handle any other unexpected exceptions return -1 From fc483650b5e8c4933ac20b647cb822cf45856596 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 11:58:17 -0700 Subject: [PATCH 087/164] Set the maximum gpu_layers value automatically when the model is loaded with --model --- modules/ui_model_menu.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 39c57bf3..cd101c4a 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -38,7 +38,7 @@ def create_ui(): gr.Markdown("## Main options") with gr.Row(): with gr.Column(): - shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') @@ -315,6 +315,14 @@ def get_initial_vram_info(): return "
Estimated VRAM to load the model:" +def get_initial_gpu_layers_max(): + if shared.model_name != 'None' and shared.args.loader == 'llama.cpp': + model_settings = get_model_metadata(shared.model_name) + return model_settings.get('gpu_layers', 256) + + return 256 + + def handle_load_model_event_initial(model, state): state = apply_model_settings_to_state(model, state) output = ui.apply_interface_values(state) From adb975a380b219bbe14bbd7a19c83eaebc15cd55 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 12:52:43 -0700 Subject: [PATCH 088/164] Prevent fractional gpu-layers in the UI --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index cd101c4a..59bb6759 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -38,7 +38,7 @@ def create_ui(): gr.Markdown("## Main options") with gr.Row(): with gr.Column(): - shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. 
q4_q8).') From 470c822f44dce2269dfaa8e3b37989195982b975 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 12:54:39 -0700 Subject: [PATCH 089/164] API: Hide the uvicorn access logs from the terminal --- extensions/openai/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 2b4f274f..2c98ee78 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -447,7 +447,7 @@ def run_server(): # Start server logging.getLogger("uvicorn.error").propagate = False - uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) + uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False) def setup(): From e4d3f4449d75ea1b1f7f3438dbed8c910a970cec Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 13:02:27 -0700 Subject: [PATCH 090/164] API: Fix a regression --- modules/llama_cpp_server.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 3fc7a0cc..d695c74e 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -146,8 +146,9 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - # Make a direct request with streaming enabled using a context manager - with self.session.post(url, json=payload, stream=True) as response: + # Make the generation request + response = self.session.post(url, json=payload, stream=True) + try: response.raise_for_status() # Raise an exception for HTTP errors full_text = "" @@ -184,6 +185,8 @@ class LlamaServer: print(f"JSON decode error: {e}") print(f"Problematic line: {line}") continue + finally: + response.close() def generate(self, prompt, state): output = "" From 1c549d176b27233daf0ef6992bf5b5d8215784f9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 17:24:06 -0700 Subject: [PATCH 091/164] Fix GPU layers slider: honor saved settings and show true maximum --- modules/models_settings.py | 30 +++++++++++++++++++++--------- modules/ui_model_menu.py | 2 +- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 3fdf3c84..6715d494 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -72,6 +72,7 @@ def get_model_metadata(model): model_settings['compress_pos_emb'] = metadata[k] elif k.endswith('block_count'): model_settings['gpu_layers'] = metadata[k] + 1 + model_settings['max_gpu_layers'] = metadata[k] + 1 if 'tokenizer.chat_template' in metadata: template = metadata['tokenizer.chat_template'] @@ -450,17 +451,28 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, max_layers = gpu_layers if auto_adjust: - # Get max layers from model metadata + # Get model settings including user preferences model_settings = get_model_metadata(model) - max_layers = model_settings.get('gpu_layers', gpu_layers) - # Auto-adjust based on available VRAM - available_vram = get_nvidia_free_vram() - if available_vram > 0: - tolerance = 906 - current_layers = max_layers - while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: - current_layers -= 1 + # Check if the value is from user 
config-user.yaml + user_config = shared.user_config + model_regex = Path(model).name + '$' + has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex] + + if has_user_setting: + # Just return the current user value without adjustment + max_layers = model_settings.get('max_gpu_layers', 256) + else: + # No user setting, use model's max and auto-adjust + max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers)) + current_layers = max_layers # Start from max + + # Auto-adjust based on available VRAM + available_vram = get_nvidia_free_vram() + if available_vram > 0: + tolerance = 906 + while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: + current_layers -= 1 # Calculate VRAM with current layers vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 59bb6759..5b7dfdd8 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -318,7 +318,7 @@ def get_initial_vram_info(): def get_initial_gpu_layers_max(): if shared.model_name != 'None' and shared.args.loader == 'llama.cpp': model_settings = get_model_metadata(shared.model_name) - return model_settings.get('gpu_layers', 256) + return model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', 256)) return 256 From d99fb0a22a44dc4fb4d695647ba07cbf55e044c6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 17:29:18 -0700 Subject: [PATCH 092/164] Add backward compatibility with saved n_gpu_layers values --- modules/models_settings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/models_settings.py b/modules/models_settings.py index 6715d494..76bce7a9 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -154,6 +154,9 @@ def get_model_metadata(model): for pat in settings: if re.match(pat.lower(), Path(model).name.lower()): for k in settings[pat]: + if k == 'n_gpu_layers': + k = 'gpu_layers' + model_settings[k] = settings[pat][k] # Load instruction template if defined by name rather than by value From 71fa046c1708a235853c359ef95b363a20c762d3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 17:38:08 -0700 Subject: [PATCH 093/164] Minor changes after 1c549d176b27233daf0ef6992bf5b5d8215784f9 --- modules/models_settings.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 76bce7a9..3a2400d4 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -457,17 +457,20 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, # Get model settings including user preferences model_settings = get_model_metadata(model) - # Check if the value is from user config-user.yaml + # Get the true maximum layers + max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers)) + + # Check if this is a user-saved setting user_config = shared.user_config model_regex = Path(model).name + '$' has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex] if has_user_setting: - # Just return the current user value without adjustment - max_layers = model_settings.get('max_gpu_layers', 256) + # For user settings, just use the current value (which already has user pref) + # but ensure the 
slider maximum is correct + current_layers = gpu_layers # Already has user setting else: - # No user setting, use model's max and auto-adjust - max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers)) + # No user setting, auto-adjust from the maximum current_layers = max_layers # Start from max # Auto-adjust based on available VRAM From e3bba510d443a0a447f85083a2dff4a116a50848 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 17:48:54 -0700 Subject: [PATCH 094/164] UI: Only add a blank space to streaming messages in instruct mode --- css/main.css | 2 +- js/main.js | 2 +- modules/html_generator.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/css/main.css b/css/main.css index 0902b184..3fec7bb0 100644 --- a/css/main.css +++ b/css/main.css @@ -390,7 +390,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { margin-left: auto; margin-right: auto; flex: 1; - overflow-y: auto; + overflow-y: hidden; display: flex; flex-direction: column; word-break: break-word; diff --git a/js/main.js b/js/main.js index 205cf88e..6cecd341 100644 --- a/js/main.js +++ b/js/main.js @@ -152,7 +152,7 @@ const observer = new MutationObserver(function(mutations) { } const chatElement = document.getElementById("chat"); - if (chatElement) { + if (chatElement && chatElement.getAttribute("data-mode") === "instruct") { const messagesContainer = chatElement.querySelector(".messages"); const lastChild = messagesContainer?.lastElementChild; const prevSibling = lastChild?.previousElementSibling; diff --git a/modules/html_generator.py b/modules/html_generator.py index 67d15b6e..39659476 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -347,7 +347,7 @@ remove_button = f'' refresh_button = f'' continue_button = f'' remove_button = f'' +info_button = f'' + + +def format_message_timestamp(history, role, index): + """Get a formatted timestamp HTML span for a message if available""" + key = f"{role}_{index}" + if 'metadata' in history and key in history['metadata'] and history['metadata'][key].get('timestamp'): + timestamp = history['metadata'][key]['timestamp'] + return f"{timestamp}" + + return "" def generate_instruct_html(history): @@ -354,6 +363,23 @@ def generate_instruct_html(history): row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Create info buttons for timestamps if they exist + info_message_user = "" + if user_timestamp != "": + # Extract the timestamp value from the span + user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_user = info_button.replace("message", user_timestamp_value) + + info_message_assistant = "" + if assistant_timestamp != "": + # Extract the timestamp value from the span + assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_assistant = info_button.replace("message", assistant_timestamp_value) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' f'
{converted_visible[0]}
' f'{copy_button}' + f'{info_message_user}' f'
' f'
' ) @@ -374,6 +401,7 @@ def generate_instruct_html(history): f'{refresh_button if i == len(history["visible"]) - 1 else ""}' f'{continue_button if i == len(history["visible"]) - 1 else ""}' f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{info_message_assistant}' f'
' f'
' ) @@ -401,13 +429,17 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' f'
{img_me}
' f'
' - f'
{name1}
' + f'
{name1}{user_timestamp}
' f'
{converted_visible[0]}
' f'{copy_button}' f'
' @@ -419,7 +451,7 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'data-raw="{html.escape(row_internal[1], quote=True)}">' f'
{img_bot}
' f'
' - f'
{name2}
' + f'
{name2}{assistant_timestamp}
' f'
{converted_visible[1]}
' f'{copy_button}' f'{refresh_button if i == len(history["visible"]) - 1 else ""}' @@ -441,6 +473,23 @@ def generate_chat_html(history, name1, name2, reset_cache=False): row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Create info buttons for timestamps if they exist + info_message_user = "" + if user_timestamp != "": + # Extract the timestamp value from the span + user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_user = info_button.replace("message", user_timestamp_value) + + info_message_assistant = "" + if assistant_timestamp != "": + # Extract the timestamp value from the span + assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_assistant = info_button.replace("message", assistant_timestamp_value) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' f'
{converted_visible[0]}
' f'{copy_button}' + f'{info_message_user}' f'
' f'
' ) @@ -461,6 +511,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'{refresh_button if i == len(history["visible"]) - 1 else ""}' f'{continue_button if i == len(history["visible"]) - 1 else ""}' f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{info_message_assistant}' f'
' f'
' ) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index a0c37dad..7a5430ca 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -47,7 +47,7 @@ def create_ui(): with gr.Row(): with gr.Column(elem_id='chat-col'): shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer - shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) + shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': [], 'metadata': {}}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) with gr.Row(elem_id="chat-input-row"): with gr.Column(scale=1, elem_id='gr-hover-container'): gr.HTML(value='
', elem_id='gr-hover') From c25a381540eb8c40e945730b058ca3e83fe0674c Mon Sep 17 00:00:00 2001 From: Daniel Dengler Date: Tue, 20 May 2025 16:07:40 +0200 Subject: [PATCH 113/164] Add a "Branch here" footer button to chat messages (#6967) --- css/main.css | 66 +++++++++------------------------------ js/global_scope_js.js | 31 ++++++++++++++++++ modules/chat.py | 10 ++++-- modules/html_generator.py | 47 +++++++++++++++------------- modules/ui.py | 1 + modules/ui_chat.py | 5 +-- 6 files changed, 83 insertions(+), 77 deletions(-) diff --git a/css/main.css b/css/main.css index 319c1778..d7142336 100644 --- a/css/main.css +++ b/css/main.css @@ -1244,67 +1244,31 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: relative; } -.footer-button { +/* New container for the buttons */ +.message-actions { position: absolute; + bottom: -23px; + left: 0; + display: flex; + gap: 5px; + opacity: 0; + transition: opacity 0.2s; +} + +.footer-button { padding: 0; margin: 0; border: none; border-radius: 3px; cursor: pointer; - opacity: 0; display: flex; align-items: center; - transition: opacity 0.2s; + justify-content: center; } -.footer-button.footer-copy-button { - bottom: -23px; - left: 0; -} - -.footer-button.footer-refresh-button { - bottom: -23px; - left: 25px; -} - -.footer-button.footer-continue-button { - bottom: -23px; - left: 50px; -} - -.footer-button.footer-remove-button { - bottom: -23px; - left: 75px; -} - -.footer-button.footer-info-button { - bottom: -23px; -} - -.user-message .footer-button.footer-info-button { - left: 25px; -} - -.assistant-message:not(:last-child) .footer-button.footer-info-button { - left: 25px; -} - -.assistant-message:last-child .footer-button.footer-info-button { - left: 100px; -} - -.message:not(:last-child) .text-bot .footer-button.footer-info-button, -.message .text-you .footer-button.footer-info-button { - left: 25px; -} - -.message:last-child .text-bot .footer-button.footer-info-button { - left: 100px; -} - -.message:hover .footer-button, -.user-message:hover .footer-button, -.assistant-message:hover .footer-button { +.message:hover .message-actions, +.user-message:hover .message-actions, +.assistant-message:hover .message-actions { opacity: 1; } diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 29d2d8bd..285d82f9 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -18,6 +18,37 @@ function copyToClipboard(element) { }); } +function branchHere(element) { + if (!element) return; + + const messageElement = element.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const index = messageElement.getAttribute("data-index"); + if (!index) return; + + const branchIndexInput = document.getElementById("Branch-index").querySelector("input"); + if (!branchIndexInput) { + console.error("Element with ID 'Branch-index' not found."); + return; + } + const branchButton = document.getElementById("Branch"); + + if (!branchButton) { + console.error("Required element 'Branch' not found."); + return; + } + + branchIndexInput.value = index; + + // Trigger any 'change' or 'input' events Gradio might be listening for + const event = new Event("input", { bubbles: true }); // 'change' might also work + branchIndexInput.dispatchEvent(event); + + branchButton.click(); // Gradio will now pick up the 'index' + +} + function regenerateClick() { document.getElementById("Regenerate").click(); } diff --git a/modules/chat.py b/modules/chat.py index cbcde212..13f733e9 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ 
-1248,7 +1248,13 @@ def handle_delete_chat_confirm_click(state): def handle_branch_chat_click(state): - history = state['history'] + branch_from_index = state['branch_index'] + if branch_from_index == -1: + history = state['history'] + else: + history = state['history'] + history['visible'] = history['visible'][:branch_from_index + 1] + history['internal'] = history['internal'][:branch_from_index + 1] new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') save_history(history, new_unique_id, state['character_menu'], state['mode']) @@ -1259,7 +1265,7 @@ def handle_branch_chat_click(state): past_chats_update = gr.update(choices=histories, value=new_unique_id) - return [history, html, past_chats_update] + return [history, html, past_chats_update, -1] def handle_rename_chat_click(): diff --git a/modules/html_generator.py b/modules/html_generator.py index 5dbde6da..36b31ac5 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -335,10 +335,12 @@ copy_svg = '''''' continue_svg = '''''' remove_svg = '''''' +branch_svg = '''''' info_svg = '''''' info_svg_small = '''''' copy_button = f'' +branch_button = f'' refresh_button = f'' continue_button = f'' remove_button = f'' @@ -355,6 +357,17 @@ def format_message_timestamp(history, role, index): return "" +def actions_html(history, i, info_message=""): + return (f'
' + f'{copy_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'{continue_button if i == len(history["visible"]) - 1 else ""}' + f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{branch_button}' + f'{info_message}' + f'
') + + def generate_instruct_html(history): output = f'
' @@ -386,22 +399,18 @@ def generate_instruct_html(history): f'data-raw="{html.escape(row_internal[0], quote=True)}">' f'
' f'
{converted_visible[0]}
' - f'{copy_button}' - f'{info_message_user}' + f'
{copy_button}{info_message_user}
' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' - f'{info_message_assistant}' + f'{actions_html(history, i, info_message_assistant)}' f'
' f'
' ) @@ -441,22 +450,20 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'
' f'
{name1}{user_timestamp}
' f'
{converted_visible[0]}
' - f'{copy_button}' + f'
{copy_button}
' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
{img_bot}
' f'
' f'
{name2}{assistant_timestamp}
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{actions_html(history, i)}' f'
' f'
' ) @@ -496,22 +503,18 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'data-raw="{html.escape(row_internal[0], quote=True)}">' f'
' f'
{converted_visible[0]}
' - f'{copy_button}' - f'{info_message_user}' + f'
{copy_button}{info_message_user}
' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' - f'{info_message_assistant}' + f'{actions_html(history, i, info_message_assistant)}' f'
' f'
' ) diff --git a/modules/ui.py b/modules/ui.py index f5dc0632..5e8fa14e 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -210,6 +210,7 @@ def list_interface_input_elements(): 'negative_prompt', 'dry_sequence_breakers', 'grammar_string', + 'branch_index' ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 7a5430ca..513a632b 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -24,7 +24,8 @@ def create_ui(): with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): with gr.Column(): with gr.Row(elem_id='past-chats-buttons'): - shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', interactive=not mu) + shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', elem_id='Branch', interactive=not mu) + shared.gradio['branch_index'] = gr.Number(value=-1, precision=0, visible=False, elem_id="Branch-index", interactive=True) shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) @@ -258,7 +259,7 @@ def create_event_handlers(): shared.gradio['branch_chat'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) + chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'branch_index'), show_progress=False) shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False) shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False) From 616ea6966d4821357076ff0c3b0a37967b736dd1 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Tue, 20 May 2025 12:51:28 -0300 Subject: [PATCH 114/164] Store previous reply versions on regenerate (#7004) --- modules/chat.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/modules/chat.py b/modules/chat.py index 13f733e9..3efc55db 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -365,6 +365,34 @@ def get_stopping_strings(state): return result +def add_message_version(history, row_idx, is_current=True): + """Add the current message as a version in the history metadata""" + if 'metadata' not in history: + history['metadata'] = {} + + if row_idx >= len(history['internal']) or not history['internal'][row_idx][1].strip(): + return # Skip if row doesn't exist or message is empty + + key = f"assistant_{row_idx}" + + # Initialize metadata structures if needed + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "versions" not in history['metadata'][key]: + history['metadata'][key]["versions"] = [] + + # Add current message as a version + history['metadata'][key]["versions"].append({ + "content": history['internal'][row_idx][1], + "visible_content": history['visible'][row_idx][1], + "timestamp": get_current_timestamp() + }) + + # Update index if this is the current version + if is_current: + history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1 + + def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False): history = 
state['history'] output = copy.deepcopy(history) @@ -405,6 +433,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess text, visible_text = output['internal'][-1][0], output['visible'][-1][0] if regenerate: row_idx = len(output['internal']) - 1 + + # Store the existing response as a version before regenerating + add_message_version(output, row_idx, is_current=False) + if loading_message: yield { 'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]], @@ -465,6 +497,11 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if is_stream: yield output + # Add the newly generated response as a version (only for regeneration) + if regenerate: + row_idx = len(output['internal']) - 1 + add_message_version(output, row_idx, is_current=True) + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output From 51c50b265d50a46b345b1b1d4afa55b5c94d5063 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 11:15:38 -0700 Subject: [PATCH 115/164] Update llama.cpp to https://github.com/ggml-org/llama.cpp/commit/b7a17463ec190aeee7b9077c606c910fb4688b84 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 1dcf8c93..c65ab8a2 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
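The wheel pins in these requirements files rely on PEP 508 environment markers (platform_system, platform_machine, python_version) so that pip installs only the build matching the running interpreter and OS. A minimal sketch of how one such marker evaluates, assuming the third-party packaging library is available; the override dictionary is purely illustrative and not part of the project:

from packaging.markers import Marker

# Marker text copied from one of the CUDA wheel pins above.
marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')

# Evaluate against the current interpreter/OS.
print(marker.evaluate())

# Evaluate against a hypothetical environment, e.g. to see whether a
# Windows user on Python 3.11 would match this particular pin.
print(marker.evaluate({
    "platform_system": "Windows",
    "platform_machine": "AMD64",
    "python_version": "3.11",
}))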
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 4a1702e9..3da16d3e 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 0caca631..271b4bd0 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 9a439798..15df937c 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 16e77264..bd2f8339 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 468f97fa..98c25649 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 
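The macOS pins above select wheels by platform_release, which is the Darwin kernel version rather than the macOS marketing version (Darwin 22.x corresponds to macOS 13, 23.x to macOS 14, 24.x to macOS 15). A small illustrative helper, not project code, that maps the current machine to the macosx_*_0 wheel variant these ranges target; the mapping dictionary is an assumption for illustration:

import platform

def macos_wheel_variant():
    """Map the Darwin kernel major version reported by platform.release()
    to the macosx_*_0 wheel variant used by the pins above (sketch only)."""
    if platform.system() != "Darwin":
        return None
    darwin_major = int(platform.release().split(".")[0])
    # Darwin 22 -> macOS 13, Darwin 23 -> macOS 14, Darwin 24 -> macOS 15
    return {22: "macosx_13_0", 23: "macosx_14_0", 24: "macosx_15_0"}.get(darwin_major)

print(macos_wheel_variant())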
tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index eb7872ed..6e13c1d2 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 3ba42c0b..67a5cb73 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and 
python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 6831c461..409252f6 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index fbb77ec0..89adbabf 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 71575b28..0b1c03fa 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index d093ab14..eb4319b7 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 064d8e6c..0a60d4de 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 342239e8..652e9900 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 4ef3e97b..c83d61c7 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 7b39feb1..e69f3bdf 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 5d00574a566ac8c66af16f76c9cbda6696e46e00 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 16:20:49 -0700 Subject: [PATCH 116/164] Minor UI fixes --- modules/models_settings.py | 4 ++-- modules/ui_model_menu.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index e742e0d8..df5a8e8d 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -438,7 +438,7 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage """ if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"): - vram_info = "
Estimated VRAM to load the model:" + vram_info = "
Estimated VRAM to load the model:
" if for_ui: return (vram_info, gr.update()) if auto_adjust else vram_info else: @@ -480,7 +480,7 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type) if for_ui: - vram_info = f"
Estimated VRAM to load the model: {vram_usage:.0f} MiB" + vram_info = f"
Estimated VRAM to load the model: {vram_usage:.0f} MiB
" if auto_adjust: return vram_info, gr.update(value=current_layers, maximum=max_layers) else: diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index d361f692..862b3893 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -310,7 +310,7 @@ def get_initial_vram_info(): for_ui=True ) - return "
Estimated VRAM to load the model:" + return "
Estimated VRAM to load the model:
" def get_initial_gpu_layers_max(): From 409a48d6bdd0f2bc861fc459cdd701d697bdd188 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Wed, 21 May 2025 00:36:20 -0300 Subject: [PATCH 117/164] Add attachments support (text files, PDF documents) (#7005) --- css/main.css | 56 ++++++++ modules/chat.py | 124 ++++++++++++++++-- modules/html_generator.py | 41 ++++++ modules/ui_chat.py | 6 +- requirements/full/requirements.txt | 1 + requirements/full/requirements_amd.txt | 1 + requirements/full/requirements_amd_noavx2.txt | 1 + .../full/requirements_apple_intel.txt | 1 + .../full/requirements_apple_silicon.txt | 1 + requirements/full/requirements_cpu_only.txt | 1 + .../full/requirements_cpu_only_noavx2.txt | 1 + requirements/full/requirements_noavx2.txt | 1 + requirements/full/requirements_nowheels.txt | 1 + requirements/portable/requirements.txt | 1 + .../portable/requirements_apple_intel.txt | 1 + .../portable/requirements_apple_silicon.txt | 1 + .../portable/requirements_cpu_only.txt | 1 + .../portable/requirements_cpu_only_noavx2.txt | 1 + requirements/portable/requirements_noavx2.txt | 1 + .../portable/requirements_nowheels.txt | 1 + requirements/portable/requirements_vulkan.txt | 1 + .../portable/requirements_vulkan_noavx2.txt | 1 + 22 files changed, 233 insertions(+), 12 deletions(-) diff --git a/css/main.css b/css/main.css index d7142336..6cb99fc3 100644 --- a/css/main.css +++ b/css/main.css @@ -592,6 +592,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 0.65rem 2.5rem; border: 0; box-shadow: 0; + border-radius: 8px; } #chat-input textarea::placeholder { @@ -611,6 +612,16 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: none; } +#chat-input .submit-button { + display: none; +} + +#chat-input .upload-button { + margin-right: 16px; + margin-bottom: 7px; + background: transparent; +} + .chat-input-positioned { max-width: 54rem; left: 50%; @@ -1395,3 +1406,48 @@ strong { .dark #vram-info .value { color: #07ff07; } + +.message-attachments { + display: flex; + flex-wrap: wrap; + gap: 8px; + margin-top: 8px; +} + +.attachment-box { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + padding: 8px; + background: rgb(0 0 0 / 5%); + border-radius: 6px; + border: 1px solid rgb(0 0 0 / 10%); + min-width: 80px; + max-width: 120px; +} + +.attachment-icon { + margin-bottom: 4px; + color: #555; +} + +.attachment-name { + font-size: 0.8em; + text-align: center; + word-break: break-word; + overflow: hidden; + text-overflow: ellipsis; + display: -webkit-box; + -webkit-line-clamp: 2; + -webkit-box-orient: vertical; +} + +.dark .attachment-box { + background: rgb(255 255 255 / 5%); + border: 1px solid rgb(255 255 255 / 10%); +} + +.dark .attachment-icon { + color: #ccc; +} diff --git a/modules/chat.py b/modules/chat.py index 3efc55db..cdd50c92 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -157,7 +157,9 @@ def generate_chat_prompt(user_input, state, **kwargs): impersonate = kwargs.get('impersonate', False) _continue = kwargs.get('_continue', False) also_return_rows = kwargs.get('also_return_rows', False) - history = kwargs.get('history', state['history'])['internal'] + history_data = kwargs.get('history', state['history']) + history = history_data['internal'] + metadata = history_data.get('metadata', {}) # Templates chat_template_str = state['chat_template_str'] @@ -196,11 +198,13 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "system", "content": context}) insert_pos = len(messages) - for entry in 
reversed(history): + for i, entry in enumerate(reversed(history)): user_msg = entry[0].strip() assistant_msg = entry[1].strip() tool_msg = entry[2].strip() if len(entry) > 2 else '' + row_idx = len(history) - i - 1 + if tool_msg: messages.insert(insert_pos, {"role": "tool", "content": tool_msg}) @@ -208,10 +212,40 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']: - messages.insert(insert_pos, {"role": "user", "content": user_msg}) + # Check for user message attachments in metadata + user_key = f"user_{row_idx}" + enhanced_user_msg = user_msg + + # Add attachment content if present + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + for attachment in metadata[user_key]["attachments"]: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if attachments_text: + enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:{attachments_text}" + + messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) user_input = user_input.strip() if user_input and not impersonate and not _continue: + # For the current user input being processed, check if we need to add attachments + if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" + + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + for attachment in metadata[user_key]["attachments"]: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if attachments_text: + user_input = f"{user_input}\n\nATTACHMENTS:{attachments_text}" + messages.append({"role": "user", "content": user_input}) def make_prompt(messages): @@ -280,7 +314,6 @@ def generate_chat_prompt(user_input, state, **kwargs): # Resort to truncating the user input else: - user_message = messages[-1]['content'] # Bisect the truncation point @@ -393,7 +426,74 @@ def add_message_version(history, row_idx, is_current=True): history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1 +def add_message_attachment(history, row_idx, file_path, is_user=True): + """Add a file attachment to a message in history metadata""" + if 'metadata' not in history: + history['metadata'] = {} + + key = f"{'user' if is_user else 'assistant'}_{row_idx}" + + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "attachments" not in history['metadata'][key]: + history['metadata'][key]["attachments"] = [] + + # Get file info using pathlib + path = Path(file_path) + filename = path.name + file_extension = path.suffix.lower() + + try: + # Handle different file types + if file_extension == '.pdf': + # Process PDF file + content = extract_pdf_text(path) + file_type = "application/pdf" + else: + # Default handling for text files + with open(path, 'r', encoding='utf-8') as f: + content = f.read() + file_type = "text/plain" + + # Add attachment + attachment = { + "name": filename, + "type": file_type, + "content": content, + } + + history['metadata'][key]["attachments"].append(attachment) + return content # Return the content for reuse + except Exception as e: + logger.error(f"Error 
processing attachment {filename}: {e}") + return None + + +def extract_pdf_text(pdf_path): + """Extract text from a PDF file""" + import PyPDF2 + + text = "" + try: + with open(pdf_path, 'rb') as file: + pdf_reader = PyPDF2.PdfReader(file) + for page_num in range(len(pdf_reader.pages)): + page = pdf_reader.pages[page_num] + text += page.extract_text() + "\n\n" + + return text.strip() + except Exception as e: + logger.error(f"Error extracting text from PDF: {e}") + return f"[Error extracting PDF text: {str(e)}]" + + def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False): + # Handle dict format with text and files + files = [] + if isinstance(text, dict): + files = text.get('files', []) + text = text.get('text', '') + history = state['history'] output = copy.deepcopy(history) output = apply_extensions('history', output) @@ -411,12 +511,18 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if not (regenerate or _continue): visible_text = html.escape(text) + # Process file attachments and store in metadata + row_idx = len(output['internal']) + + # Add attachments to metadata only, not modifying the message text + for file_path in files: + add_message_attachment(output, row_idx, file_path, is_user=True) + # Apply extensions text, visible_text = apply_extensions('chat_input', text, visible_text, state) text = apply_extensions('input', text, state, is_chat=True) # Current row index - row_idx = len(output['internal']) output['internal'].append([text, '']) output['visible'].append([visible_text, '']) # Add metadata with timestamp @@ -1215,7 +1321,7 @@ def handle_replace_last_reply_click(text, state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_send_dummy_message_click(text, state): @@ -1223,7 +1329,7 @@ def handle_send_dummy_message_click(text, state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_send_dummy_reply_click(text, state): @@ -1231,7 +1337,7 @@ def handle_send_dummy_reply_click(text, state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_remove_last_click(state): @@ -1239,7 +1345,7 @@ def handle_remove_last_click(state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, last_input] + return [history, html, {"text": last_input, "files": []}] def handle_unique_id_select(state): diff --git a/modules/html_generator.py b/modules/html_generator.py index 36b31ac5..f5e0b28f 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -338,6 +338,7 @@ remove_svg = '''''' info_svg = '''''' info_svg_small = '''''' +attachment_svg = '''''' copy_button = f'' branch_button = f'' @@ -357,6 +358,28 @@ 
def format_message_timestamp(history, role, index): return "" +def format_message_attachments(history, role, index): + """Get formatted HTML for message attachments if available""" + key = f"{role}_{index}" + if 'metadata' in history and key in history['metadata'] and 'attachments' in history['metadata'][key]: + attachments = history['metadata'][key]['attachments'] + if not attachments: + return "" + + attachments_html = '
' + for attachment in attachments: + attachments_html += ( + f'
' + f'
{attachment_svg}
' + f'
{html.escape(attachment["name"])}
' + f'
' + ) + attachments_html += '
' + return attachments_html + + return "" + + def actions_html(history, i, info_message=""): return (f'
' f'{copy_button}' @@ -380,6 +403,10 @@ def generate_instruct_html(history): user_timestamp = format_message_timestamp(history, "user", i) assistant_timestamp = format_message_timestamp(history, "assistant", i) + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + # Create info buttons for timestamps if they exist info_message_user = "" if user_timestamp != "": @@ -399,6 +426,7 @@ def generate_instruct_html(history): f'data-raw="{html.escape(row_internal[0], quote=True)}">' f'
' f'
{converted_visible[0]}
' + f'{user_attachments}' f'
{copy_button}{info_message_user}
' f'
' f'
' @@ -410,6 +438,7 @@ def generate_instruct_html(history): f'data-index={i}>' f'
' f'
{converted_visible[1]}
' + f'{assistant_attachments}' f'{actions_html(history, i, info_message_assistant)}' f'
' f'
' @@ -442,6 +471,10 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= user_timestamp = format_message_timestamp(history, "user", i) assistant_timestamp = format_message_timestamp(history, "assistant", i) + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' f'
{name1}{user_timestamp}
' f'
{converted_visible[0]}
' + f'{user_attachments}' f'
{copy_button}
' f'
' f'
' @@ -463,6 +497,7 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'
' f'
{name2}{assistant_timestamp}
' f'
{converted_visible[1]}
' + f'{assistant_attachments}' f'{actions_html(history, i)}' f'
' f'
' @@ -484,6 +519,10 @@ def generate_chat_html(history, name1, name2, reset_cache=False): user_timestamp = format_message_timestamp(history, "user", i) assistant_timestamp = format_message_timestamp(history, "assistant", i) + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + # Create info buttons for timestamps if they exist info_message_user = "" if user_timestamp != "": @@ -503,6 +542,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'data-raw="{html.escape(row_internal[0], quote=True)}">' f'
' f'
{converted_visible[0]}
' + f'{user_attachments}' f'
{copy_button}{info_message_user}
' f'
' f'
' @@ -514,6 +554,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'data-index={i}>' f'
' f'
{converted_visible[1]}
' + f'{assistant_attachments}' f'{actions_html(history, i, info_message_assistant)}' f'
' f'
' diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 513a632b..f244113c 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -54,7 +54,7 @@ def create_ui(): gr.HTML(value='
', elem_id='gr-hover') with gr.Column(scale=10, elem_id='chat-input-container'): - shared.gradio['textbox'] = gr.Textbox(label='', placeholder='Send a message', elem_id='chat-input', elem_classes=['add_scrollbar']) + shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], elem_id='chat-input', elem_classes=['add_scrollbar']) shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls') shared.gradio['typing-dots'] = gr.HTML(value='
', label='typing', elem_id='typing-container') @@ -186,7 +186,7 @@ def create_event_handlers(): shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( @@ -194,7 +194,7 @@ def create_event_handlers(): shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index c65ab8a2..afb5f9d4 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -13,6 +13,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 3da16d3e..46c33034 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 271b4bd0..c8e94cbd 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 15df937c..dc403ae2 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index bd2f8339..5c643c4c 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 98c25649..ccabea84 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -12,6 +12,7 @@ peft==0.15.* 
Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 6e13c1d2..7e9da47f 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 67a5cb73..fdf5cd0e 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -13,6 +13,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 2e631bf0..22d39ded 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 409252f6..ec9bafc6 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 89adbabf..025a737e 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 0b1c03fa..32644e87 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index eb4319b7..bd5c1d9b 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 0a60d4de..51f2b7d9 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 652e9900..aad6bf5a 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index 6f9566ba..4c055426 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown 
numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index c83d61c7..3d98d1b0 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index e69f3bdf..f954b8d2 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich From cc8a4fdcb114bfd068c42cea267e34daaf901a30 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 21:31:18 -0700 Subject: [PATCH 118/164] Minor improvement to attachments prompt format --- modules/chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index cdd50c92..715f4327 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -225,7 +225,7 @@ def generate_chat_prompt(user_input, state, **kwargs): attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" if attachments_text: - enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:{attachments_text}" + enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}" messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) @@ -244,7 +244,7 @@ def generate_chat_prompt(user_input, state, **kwargs): attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" if attachments_text: - user_input = f"{user_input}\n\nATTACHMENTS:{attachments_text}" + user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}" messages.append({"role": "user", "content": user_input}) From 8620d6ffe73048932594494752f82cc4a20f8f92 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 21:34:07 -0700 Subject: [PATCH 119/164] Make it possible to upload multiple text files/pdfs at once --- modules/ui_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index f244113c..ab4b4e60 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -54,7 +54,7 @@ def create_ui(): gr.HTML(value='
', elem_id='gr-hover') with gr.Column(scale=10, elem_id='chat-input-container'): - shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], elem_id='chat-input', elem_classes=['add_scrollbar']) + shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar']) shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls') shared.gradio['typing-dots'] = gr.HTML(value='
', label='typing', elem_id='typing-container') From 0d3f85477897c2999f456713ce998b59b26a6a22 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 21:40:42 -0700 Subject: [PATCH 120/164] Improve the style of thinking blocks --- css/main.css | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/css/main.css b/css/main.css index 6cb99fc3..8444cae8 100644 --- a/css/main.css +++ b/css/main.css @@ -1370,6 +1370,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { contain: layout; } +.chat .message-body .thinking-content p, +.chat .message-body .thinking-content li { + font-size: 14px !important; +} + /* Animation for opening thinking blocks */ @keyframes fadeIn { from { opacity: 0; } From 7f6579ab20d8fd215e81f3b766f3aa9d83066bdb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 21:49:44 -0700 Subject: [PATCH 121/164] Minor style change --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 8444cae8..d1be8eb1 100644 --- a/css/main.css +++ b/css/main.css @@ -1372,7 +1372,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat .message-body .thinking-content p, .chat .message-body .thinking-content li { - font-size: 14px !important; + font-size: 15px !important; } /* Animation for opening thinking blocks */ From bae1aa34aa020aa749f942708b96e28e2b85c4a4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 25 May 2025 17:19:26 -0700 Subject: [PATCH 122/164] Fix loading `Llama-3_3-Nemotron-Super-49B-v1` and similar models (closes #7012) --- modules/models_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index df5a8e8d..c914bdea 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -335,7 +335,7 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): if key.endswith('.block_count'): n_layers = value elif key.endswith('.attention.head_count_kv'): - n_kv_heads = value + n_kv_heads = max(value) if isinstance(value, list) else value elif key.endswith('.embedding_length'): embedding_dim = value From 73bfc936a078ce428cc10b590a83e0391b6aed58 Mon Sep 17 00:00:00 2001 From: djholtby Date: Mon, 26 May 2025 21:39:03 -0400 Subject: [PATCH 123/164] Close response generator when stopping API generation (#7014) --- extensions/openai/script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index b6abae20..24bcd69d 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -125,6 +125,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest): yield {"data": json.dumps(resp)} finally: stop_everything_event() + response.close() return return EventSourceResponse(generator()) # SSE streaming @@ -157,6 +158,7 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion yield {"data": json.dumps(resp)} finally: stop_everything_event() + response.close() return return EventSourceResponse(generator()) # SSE streaming From 8531100109ecc4a5bed41cc2f3adaddf9d7157f8 Mon Sep 17 00:00:00 2001 From: Underscore <47636331+Th-Underscore@users.noreply.github.com> Date: Mon, 26 May 2025 21:40:09 -0400 Subject: [PATCH 124/164] Fix textbox text usage in methods (#7009) --- modules/chat.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git 
a/modules/chat.py b/modules/chat.py index 715f4327..36a07836 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -708,8 +708,9 @@ def send_last_reply_to_input(history): return '' -def replace_last_reply(text, state): +def replace_last_reply(textbox, state): history = state['history'] + text = textbox['text'] # Initialize metadata if not present if 'metadata' not in history: @@ -726,8 +727,9 @@ def replace_last_reply(text, state): return history -def send_dummy_message(text, state): +def send_dummy_message(textbox, state): history = state['history'] + text = textbox['text'] # Initialize metadata if not present if 'metadata' not in history: @@ -741,8 +743,9 @@ def send_dummy_message(text, state): return history -def send_dummy_reply(text, state): +def send_dummy_reply(textbox, state): history = state['history'] + text = textbox['text'] # Initialize metadata if not present if 'metadata' not in history: From cc9b7253c1216e5340da85cba9b65a13cf3526e9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 May 2025 23:13:10 -0300 Subject: [PATCH 125/164] Update transformers requirement in /requirements/full (#7017) --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index afb5f9d4..3d18f5fd 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -21,7 +21,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 46c33034..82b19964 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index c8e94cbd..a8b03014 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index dc403ae2..5a61ac7d 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 5c643c4c..6862c3b4 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_cpu_only.txt 
b/requirements/full/requirements_cpu_only.txt index ccabea84..e6982779 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 7e9da47f..97bff786 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index fdf5cd0e..17c7e246 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -21,7 +21,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 22d39ded..89b32caf 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb From 355b5f6c8b5552ccdae1aa363931724306bdbb16 Mon Sep 17 00:00:00 2001 From: Underscore <47636331+Th-Underscore@users.noreply.github.com> Date: Tue, 27 May 2025 21:54:18 -0400 Subject: [PATCH 126/164] UI: Add message version navigation (#6947) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> --- css/main.css | 41 ++++++++++++++++- js/global_scope_js.js | 38 ++++++++++++++++ js/main.js | 93 ++++++++++++++++++++++++++++++++++++++- modules/chat.py | 59 +++++++++++++++++++++++-- modules/html_generator.py | 27 +++++++++++- modules/ui.py | 2 + modules/ui_chat.py | 10 +++++ 7 files changed, 262 insertions(+), 8 deletions(-) diff --git a/css/main.css b/css/main.css index d1be8eb1..be27544c 100644 --- a/css/main.css +++ b/css/main.css @@ -1260,7 +1260,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: absolute; bottom: -23px; left: 0; - display: flex; + display: flex; gap: 5px; opacity: 0; transition: opacity 0.2s; @@ -1456,3 +1456,42 @@ strong { .dark .attachment-icon { color: #ccc; } + +/* --- Simple Version Navigation --- */ +.version-navigation { + position: absolute; + bottom: -23px; + right: 0; + display: flex; + align-items: center; + gap: 5px; + opacity: 0; + transition: opacity 0.2s; +} + +.message:hover .version-navigation, +.user-message:hover .version-navigation, +.assistant-message:hover .version-navigation { + opacity: 1; +} + +.version-nav-button { + padding: 2px 6px; + font-size: 12px; + min-width: auto; +} + +.version-nav-button[disabled] { + opacity: 0.3; + cursor: not-allowed; +} + +.version-position { + font-size: 11px; + color: currentColor; + font-family: monospace; + min-width: 35px; + text-align: center; + opacity: 0.8; + user-select: none; +} diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 285d82f9..9174622e 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -49,6 +49,44 @@ function branchHere(element) { } +function navigateVersion(element, direction) { + if (!element) return; + + const messageElement = element.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const index = 
messageElement.getAttribute("data-index"); + if (!index) return; + + const indexInput = document.getElementById("Navigate-message-index").querySelector("input"); + if (!indexInput) { + console.error("Element with ID 'Navigate-message-index' not found."); + return; + } + + const directionInput = document.getElementById("Navigate-direction").querySelector("textarea"); + if (!directionInput) { + console.error("Element with ID 'Navigate-direction' not found."); + return; + } + + const navigateButton = document.getElementById("Navigate-version"); + if (!navigateButton) { + console.error("Required element 'Navigate-version' not found."); + return; + } + + indexInput.value = index; + directionInput.value = direction; + + // Trigger any 'change' or 'input' events Gradio might be listening for + const event = new Event("input", { bubbles: true }); + indexInput.dispatchEvent(event); + directionInput.dispatchEvent(event); + + navigateButton.click(); +} + function regenerateClick() { document.getElementById("Regenerate").click(); } diff --git a/js/main.js b/js/main.js index 01c346a7..d90e8ade 100644 --- a/js/main.js +++ b/js/main.js @@ -39,9 +39,24 @@ document.querySelector(".header_bar").addEventListener("click", function(event) //------------------------------------------------ // Keyboard shortcuts //------------------------------------------------ + +// --- Helper functions --- // +function isModifiedKeyboardEvent() { + return (event instanceof KeyboardEvent && + event.shiftKey || + event.ctrlKey || + event.altKey || + event.metaKey); +} + +function isFocusedOnEditableTextbox() { + if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") { + return !!event.target.value; + } +} + let previousTabId = "chat-tab-button"; document.addEventListener("keydown", function(event) { - // Stop generation on Esc pressed if (event.key === "Escape") { // Find the element with id 'stop' and click it @@ -49,10 +64,15 @@ document.addEventListener("keydown", function(event) { if (stopButton) { stopButton.click(); } + return; + } + + if (!document.querySelector("#chat-tab").checkVisibility() ) { + return; } // Show chat controls on Ctrl + S - else if (event.ctrlKey && event.key == "s") { + if (event.ctrlKey && event.key == "s") { event.preventDefault(); var showControlsElement = document.getElementById("show-controls"); @@ -100,6 +120,23 @@ document.addEventListener("keydown", function(event) { document.getElementById("Impersonate").click(); } + // --- Simple version navigation --- // + if (!isFocusedOnEditableTextbox()) { + // Version navigation on Arrow keys (horizontal) + if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") { + event.preventDefault(); + navigateLastAssistantMessage("left"); + } + + else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") { + event.preventDefault(); + if (!navigateLastAssistantMessage("right")) { + // If can't navigate right (last version), regenerate + document.getElementById("Regenerate").click(); + } + } + } + }); //------------------------------------------------ @@ -789,3 +826,55 @@ function createMobileTopBar() { } createMobileTopBar(); + +//------------------------------------------------ +// Simple Navigation Functions +//------------------------------------------------ + +function navigateLastAssistantMessage(direction) { + const chat = document.querySelector("#chat"); + if (!chat) return false; + + const messages = chat.querySelectorAll("[data-index]"); + if (messages.length === 0) return false; + + // Find the last assistant message 
(starting from the end) + let lastAssistantMessage = null; + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if ( + msg.classList.contains("assistant-message") || + msg.querySelector(".circle-bot") || + msg.querySelector(".text-bot") + ) { + lastAssistantMessage = msg; + break; + } + } + + if (!lastAssistantMessage) return false; + + const buttons = lastAssistantMessage.querySelectorAll(".version-nav-button"); + + for (let i = 0; i < buttons.length; i++) { + const button = buttons[i]; + const onclick = button.getAttribute("onclick"); + const disabled = button.hasAttribute("disabled"); + + const isLeft = onclick && onclick.includes("'left'"); + const isRight = onclick && onclick.includes("'right'"); + + if (!disabled) { + if (direction === "left" && isLeft) { + navigateVersion(button, direction); + return true; + } + if (direction === "right" && isRight) { + navigateVersion(button, direction); + return true; + } + } + } + + return false; +} diff --git a/modules/chat.py b/modules/chat.py index 36a07836..6eed47ee 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -414,10 +414,20 @@ def add_message_version(history, row_idx, is_current=True): if "versions" not in history['metadata'][key]: history['metadata'][key]["versions"] = [] + # Check if this version already exists + current_content = history['internal'][row_idx][1] + current_visible = history['visible'][row_idx][1] + + for i, version in enumerate(history['metadata'][key]["versions"]): + if version['content'] == current_content and version['visible_content'] == current_visible: + if is_current: + history['metadata'][key]["current_version_index"] = i + return + # Add current message as a version history['metadata'][key]["versions"].append({ - "content": history['internal'][row_idx][1], - "visible_content": history['visible'][row_idx][1], + "content": current_content, + "visible_content": current_visible, "timestamp": get_current_timestamp() }) @@ -540,8 +550,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if regenerate: row_idx = len(output['internal']) - 1 - # Store the existing response as a version before regenerating - add_message_version(output, row_idx, is_current=False) + # Store the first response as a version before regenerating + if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'): + add_message_version(output, row_idx, is_current=False) if loading_message: yield { @@ -1414,6 +1425,46 @@ def handle_branch_chat_click(state): return [history, html, past_chats_update, -1] +def handle_navigate_version_click(state): + history = state['history'] + message_index = int(state['navigate_message_index']) + direction = state['navigate_direction'] + + # Get assistant message metadata + key = f"assistant_{message_index}" + if key not in history['metadata'] or 'versions' not in history['metadata'][key]: + # No versions to navigate + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + metadata = history['metadata'][key] + current_idx = metadata.get('current_version_index', 0) + versions = metadata['versions'] + + # Calculate new index + if direction == 'left': + new_idx = max(0, current_idx - 1) + else: # right + new_idx = min(len(versions) - 1, current_idx + 1) + + if new_idx == current_idx: + # No change needed + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] 
+ + # Update history with new version + version = versions[new_idx] + history['internal'][message_index][1] = version['content'] + history['visible'][message_index][1] = version['visible_content'] + metadata['current_version_index'] = new_idx + + # Redraw and save + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + save_history(history, state['unique_id'], state['character_menu'], state['mode']) + + return [history, html] + + def handle_rename_chat_click(): return [ gr.update(value="My New Chat"), diff --git a/modules/html_generator.py b/modules/html_generator.py index f5e0b28f..1dfeb445 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -380,6 +380,30 @@ def format_message_attachments(history, role, index): return "" +def get_version_navigation_html(history, i): + """Generate simple navigation arrows for message versions""" + key = f"assistant_{i}" + metadata = history.get('metadata', {}) + + if key not in metadata or 'versions' not in metadata[key]: + return "" + + versions = metadata[key]['versions'] + current_idx = metadata[key].get('current_version_index', 0) + + if len(versions) <= 1: + return "" + + left_disabled = ' disabled' if current_idx == 0 else '' + right_disabled = ' disabled' if current_idx >= len(versions) - 1 else '' + + left_arrow = f'' + right_arrow = f'' + position = f'{current_idx + 1}/{len(versions)}' + + return f'
{left_arrow}{position}{right_arrow}
' + + def actions_html(history, i, info_message=""): return (f'
' f'{copy_button}' @@ -388,7 +412,8 @@ def actions_html(history, i, info_message=""): f'{remove_button if i == len(history["visible"]) - 1 else ""}' f'{branch_button}' f'{info_message}' - f'
') + f'
' + f'{get_version_navigation_html(history, i)}') def generate_instruct_html(history): diff --git a/modules/ui.py b/modules/ui.py index 5e8fa14e..52c095a2 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -157,6 +157,8 @@ def list_model_elements(): def list_interface_input_elements(): elements = [ + 'navigate_message_index', + 'navigate_direction', 'temperature', 'dynatemp_low', 'dynatemp_high', diff --git a/modules/ui_chat.py b/modules/ui_chat.py index ab4b4e60..7a9f6f76 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -97,6 +97,12 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) + # Hidden elements for version navigation (similar to branch) + with gr.Row(visible=False): + shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") + shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction") + shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version") + def create_chat_settings_ui(): mu = shared.args.multi_user @@ -293,6 +299,10 @@ def create_event_handlers(): shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False) shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) + shared.gradio['navigate_version'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_navigate_version_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False) + # Save/delete a character shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter'), show_progress=False) From 5028480ebabf26ec44778588b4fbd019cd9456ed Mon Sep 17 00:00:00 2001 From: Underscore <47636331+Th-Underscore@users.noreply.github.com> Date: Tue, 27 May 2025 23:55:27 -0400 Subject: [PATCH 127/164] UI: Add footer buttons for editing messages (#7019) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> --- css/main.css | 49 +++++++++++++- js/global_scope_js.js | 132 +++++++++++++++++++++++++++++++++++++- js/main.js | 26 ++++---- modules/chat.py | 80 +++++++++++++---------- modules/html_generator.py | 44 ++++++++----- modules/ui.py | 7 +- modules/ui_chat.py | 18 +++--- 7 files changed, 282 insertions(+), 74 deletions(-) diff --git a/css/main.css b/css/main.css index be27544c..9d68ba02 100644 --- a/css/main.css +++ b/css/main.css @@ -1457,6 +1457,53 @@ strong { color: #ccc; } +/* Message Editing Styles */ +.editing-textarea { + width: 100%; + min-height: 200px; + padding: 10px; + border-radius: 5px; + border: 1px solid #ccc; + background-color: var(--light-theme-gray); + font-family: inherit; + font-size: inherit; + resize: vertical; +} + +.dark .editing-textarea { + border: 1px solid var(--border-color-dark); + background-color: var(--darker-gray); +} + +.editing-textarea:focus { + outline: none; + border-color: var(--selected-item-color-dark); +} + +.edit-controls-container 
{ + margin-top: 0; + display: flex; + gap: 8px; + padding-bottom: 8px; +} + +.edit-control-button { + padding: 6px 12px; + border: 1px solid #ccc; + border-radius: 4px; + cursor: pointer; + background-color: #f8f9fa; + color: #212529; + font-size: 12px; + margin: 0; +} + +.dark .edit-control-button { + border: 1px solid var(--border-color-dark); + background-color: var(--light-gray); + color: #efefef; +} + /* --- Simple Version Navigation --- */ .version-navigation { position: absolute; @@ -1488,7 +1535,7 @@ strong { .version-position { font-size: 11px; - color: currentColor; + color: currentcolor; font-family: monospace; min-width: 35px; text-align: center; diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 9174622e..0e86d450 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -1,3 +1,7 @@ +// ------------------------------------------------- +// Event handlers +// ------------------------------------------------- + function copyToClipboard(element) { if (!element) return; @@ -42,11 +46,135 @@ function branchHere(element) { branchIndexInput.value = index; // Trigger any 'change' or 'input' events Gradio might be listening for - const event = new Event("input", { bubbles: true }); // 'change' might also work + const event = new Event("input", { bubbles: true }); branchIndexInput.dispatchEvent(event); - branchButton.click(); // Gradio will now pick up the 'index' + branchButton.click(); +} +// ------------------------------------------------- +// Message Editing Functions +// ------------------------------------------------- + +function editHere(buttonElement) { + if (!buttonElement) return; + + const messageElement = buttonElement.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const messageBody = messageElement.querySelector(".message-body"); + if (!messageBody) return; + + // If already editing, focus the textarea + const existingTextarea = messageBody.querySelector(".editing-textarea"); + if (existingTextarea) { + existingTextarea.focus(); + return; + } + + // Determine role based on message element - handle different chat modes + const isUserMessage = messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") !== null || + messageElement.querySelector(".circle-you") !== null; + + startEditing(messageElement, messageBody, isUserMessage); +} + +function startEditing(messageElement, messageBody, isUserMessage) { + const rawText = messageElement.getAttribute("data-raw") || messageBody.textContent; + const originalHTML = messageBody.innerHTML; + + // Create editing interface + const editingInterface = createEditingInterface(rawText); + + // Replace message content + messageBody.innerHTML = ""; + messageBody.appendChild(editingInterface.textarea); + messageBody.appendChild(editingInterface.controls); + + editingInterface.textarea.focus(); + editingInterface.textarea.setSelectionRange(rawText.length, rawText.length); + + // Setup event handlers + setupEditingHandlers(editingInterface.textarea, messageElement, originalHTML, messageBody, isUserMessage); +} + +function createEditingInterface(text) { + const textarea = document.createElement("textarea"); + textarea.value = text; + textarea.className = "editing-textarea"; + textarea.rows = Math.max(3, text.split("\n").length); + + const controls = document.createElement("div"); + controls.className = "edit-controls-container"; + + const saveButton = document.createElement("button"); + saveButton.textContent = "Save"; + saveButton.className 
= "edit-control-button"; + saveButton.type = "button"; + + const cancelButton = document.createElement("button"); + cancelButton.textContent = "Cancel"; + cancelButton.className = "edit-control-button edit-cancel-button"; + cancelButton.type = "button"; + + controls.appendChild(saveButton); + controls.appendChild(cancelButton); + + return { textarea, controls, saveButton, cancelButton }; +} + +function setupEditingHandlers(textarea, messageElement, originalHTML, messageBody, isUserMessage) { + const saveButton = messageBody.querySelector(".edit-control-button:not(.edit-cancel-button)"); + const cancelButton = messageBody.querySelector(".edit-cancel-button"); + + const submitEdit = () => { + const index = messageElement.getAttribute("data-index"); + if (!index || !submitMessageEdit(index, textarea.value, isUserMessage)) { + cancelEdit(); + } + }; + + const cancelEdit = () => { + messageBody.innerHTML = originalHTML; + }; + + // Event handlers + saveButton.onclick = submitEdit; + cancelButton.onclick = cancelEdit; + + textarea.onkeydown = (e) => { + if (e.key === "Enter" && !e.shiftKey) { + e.preventDefault(); + submitEdit(); + } else if (e.key === "Escape") { + e.preventDefault(); + cancelEdit(); + } + }; +} + +function submitMessageEdit(index, newText, isUserMessage) { + const editIndexInput = document.getElementById("Edit-message-index")?.querySelector("input"); + const editTextInput = document.getElementById("Edit-message-text")?.querySelector("textarea"); + const editRoleInput = document.getElementById("Edit-message-role")?.querySelector("textarea"); + const editButton = document.getElementById("Edit-message"); + + if (!editIndexInput || !editTextInput || !editRoleInput || !editButton) { + console.error("Edit elements not found"); + return false; + } + + editIndexInput.value = index; + editTextInput.value = newText; + editRoleInput.value = isUserMessage ? 
"user" : "assistant"; + + editIndexInput.dispatchEvent(new Event("input", { bubbles: true })); + editTextInput.dispatchEvent(new Event("input", { bubbles: true })); + editRoleInput.dispatchEvent(new Event("input", { bubbles: true })); + + editButton.click(); + return true; } function navigateVersion(element, direction) { diff --git a/js/main.js b/js/main.js index d90e8ade..fc014f66 100644 --- a/js/main.js +++ b/js/main.js @@ -1,3 +1,7 @@ +// ------------------------------------------------ +// Main +// ------------------------------------------------ + let main_parent = document.getElementById("chat-tab").parentNode; let extensions = document.getElementById("extensions"); @@ -102,18 +106,6 @@ document.addEventListener("keydown", function(event) { document.getElementById("Remove-last").click(); } - // Copy last on Ctrl + Shift + K - else if (event.ctrlKey && event.shiftKey && event.key === "K") { - event.preventDefault(); - document.getElementById("Copy-last").click(); - } - - // Replace last on Ctrl + Shift + L - else if (event.ctrlKey && event.shiftKey && event.key === "L") { - event.preventDefault(); - document.getElementById("Replace-last").click(); - } - // Impersonate on Ctrl + Shift + M else if (event.ctrlKey && event.shiftKey && event.key === "M") { event.preventDefault(); @@ -388,6 +380,16 @@ document.addEventListener("click", function (event) { } }); +document.addEventListener("dblclick", (event) => { + const messageElement = event.target.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const editButton = messageElement.querySelector(".footer-edit-button"); + if (editButton) { + editButton.click(); + } +}); + //------------------------------------------------ // Relocate the "Show controls" checkbox //------------------------------------------------ diff --git a/modules/chat.py b/modules/chat.py index 6eed47ee..9598efa7 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -712,32 +712,6 @@ def remove_last_message(history): return html.unescape(last[0]), history -def send_last_reply_to_input(history): - if len(history['visible']) > 0: - return html.unescape(history['visible'][-1][1]) - else: - return '' - - -def replace_last_reply(textbox, state): - history = state['history'] - text = textbox['text'] - - # Initialize metadata if not present - if 'metadata' not in history: - history['metadata'] = {} - - if len(text.strip()) == 0: - return history - elif len(history['visible']) > 0: - row_idx = len(history['internal']) - 1 - history['visible'][-1][1] = html.escape(text) - history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) - update_message_metadata(history['metadata'], "assistant", row_idx, timestamp=get_current_timestamp()) - - return history - - def send_dummy_message(textbox, state): history = state['history'] text = textbox['text'] @@ -1330,14 +1304,6 @@ def my_yaml_output(data): return result -def handle_replace_last_reply_click(text, state): - history = replace_last_reply(text, state) - save_history(history, state['unique_id'], state['character_menu'], state['mode']) - html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - - return [history, html, {"text": "", "files": []}] - - def handle_send_dummy_message_click(text, state): history = send_dummy_message(text, state) save_history(history, state['unique_id'], state['character_menu'], state['mode']) @@ -1425,6 +1391,52 @@ def handle_branch_chat_click(state): return [history, html, 
past_chats_update, -1] +def handle_edit_message_click(state): + history = state['history'] + message_index = int(state['edit_message_index']) + new_text = state['edit_message_text'] + role = state['edit_message_role'] # "user" or "assistant" + + if message_index >= len(history['internal']): + html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html_output, gr.update()] + + # Use the role passed from frontend + is_user_msg = (role == "user") + role_idx = 0 if is_user_msg else 1 + + # For assistant messages, save the original version BEFORE updating content + if not is_user_msg: + if not history['metadata'].get(f"assistant_{message_index}", {}).get('versions'): + add_message_version(history, message_index, is_current=False) + + # NOW update the message content + history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True) + history['visible'][message_index][role_idx] = html.escape(new_text) + + # Branch if editing user message, add version if editing assistant message + if is_user_msg: + # Branch like branch-here + history['visible'] = history['visible'][:message_index + 1] + history['internal'] = history['internal'][:message_index + 1] + new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') + save_history(history, new_unique_id, state['character_menu'], state['mode']) + histories = find_all_histories_with_first_prompts(state) + past_chats_update = gr.update(choices=histories, value=new_unique_id) + state['unique_id'] = new_unique_id + elif not is_user_msg: + # Add the new version as current + add_message_version(history, message_index, is_current=True) + past_chats_update = gr.update() + else: + past_chats_update = gr.update() + + save_history(history, state['unique_id'], state['character_menu'], state['mode']) + html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + + return [history, html_output, past_chats_update] + + def handle_navigate_version_click(state): history = state['history'] message_index = int(state['navigate_message_index']) diff --git a/modules/html_generator.py b/modules/html_generator.py index 1dfeb445..9a93555f 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -336,12 +336,14 @@ refresh_svg = '''''' remove_svg = '''''' branch_svg = '''''' +edit_svg = '''''' info_svg = '''''' info_svg_small = '''''' attachment_svg = '''''' copy_button = f'' branch_button = f'' +edit_button = f'' refresh_button = f'' continue_button = f'' remove_button = f'' @@ -404,16 +406,23 @@ def get_version_navigation_html(history, i): return f'
{left_arrow}{position}{right_arrow}
' -def actions_html(history, i, info_message=""): +def actions_html(history, i, role, info_message=""): + if role == "assistant": + return (f'
' + f'{copy_button}' + f'{edit_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'{continue_button if i == len(history["visible"]) - 1 else ""}' + f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{branch_button}' + f'{info_message}' + f'
' + f'{get_version_navigation_html(history, i)}') return (f'
' f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' - f'{branch_button}' + f'{edit_button}' f'{info_message}' - f'
' - f'{get_version_navigation_html(history, i)}') + f'
') def generate_instruct_html(history): @@ -448,11 +457,12 @@ def generate_instruct_html(history): if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[0]}
' f'{user_attachments}' - f'
{copy_button}{info_message_user}
' + f'{actions_html(history, i, "user", info_message_user)}' f'
' f'
' ) @@ -464,7 +474,7 @@ def generate_instruct_html(history): f'
' f'
{converted_visible[1]}
' f'{assistant_attachments}' - f'{actions_html(history, i, info_message_assistant)}' + f'{actions_html(history, i, "assistant", info_message_assistant)}' f'
' f'
' ) @@ -503,13 +513,14 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
{img_me}
' f'
' f'
{name1}{user_timestamp}
' f'
{converted_visible[0]}
' f'{user_attachments}' - f'
{copy_button}
' + f'{actions_html(history, i, "user")}' f'
' f'
' ) @@ -523,7 +534,7 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'
{name2}{assistant_timestamp}
' f'
{converted_visible[1]}
' f'{assistant_attachments}' - f'{actions_html(history, i)}' + f'{actions_html(history, i, "assistant")}' f'
' f'
' ) @@ -564,11 +575,12 @@ def generate_chat_html(history, name1, name2, reset_cache=False): if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[0]}
' f'{user_attachments}' - f'
{copy_button}{info_message_user}
' + f'{actions_html(history, i, "user", info_message_user)}' f'
' f'
' ) @@ -580,7 +592,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'
' f'
{converted_visible[1]}
' f'{assistant_attachments}' - f'{actions_html(history, i, info_message_assistant)}' + f'{actions_html(history, i, "assistant", info_message_assistant)}' f'
' f'
' ) diff --git a/modules/ui.py b/modules/ui.py index 52c095a2..00393b53 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -212,7 +212,12 @@ def list_interface_input_elements(): 'negative_prompt', 'dry_sequence_breakers', 'grammar_string', - 'branch_index' + 'navigate_message_index', + 'navigate_direction', + 'edit_message_index', + 'edit_message_text', + 'edit_message_role', + 'branch_index', ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 7a9f6f76..2856ce1f 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -71,8 +71,6 @@ def create_ui(): shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last') with gr.Row(): - shared.gradio['Replace last reply'] = gr.Button('Replace last reply (Ctrl + Shift + L)', elem_id='Replace-last') - shared.gradio['Copy last reply'] = gr.Button('Copy last reply (Ctrl + Shift + K)', elem_id='Copy-last') shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate') with gr.Row(): @@ -97,11 +95,15 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) - # Hidden elements for version navigation (similar to branch) + # Hidden elements for version navigation and editing with gr.Row(visible=False): shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction") shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version") + shared.gradio['edit_message_index'] = gr.Number(value=-1, precision=0, elem_id="Edit-message-index") + shared.gradio['edit_message_text'] = gr.Textbox(value="", elem_id="Edit-message-text") + shared.gradio['edit_message_role'] = gr.Textbox(value="", elem_id="Edit-message-role") + shared.gradio['edit_message'] = gr.Button(elem_id="Edit-message") def create_chat_settings_ui(): @@ -228,10 +230,6 @@ def create_event_handlers(): None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') - shared.gradio['Replace last reply'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_replace_last_reply_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) - shared.gradio['Send dummy message'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_send_dummy_message_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) @@ -297,12 +295,16 @@ def create_event_handlers(): None, gradio('mode'), None, js="(mode) => {mode === 'instruct' ? 
document.getElementById('character-menu').parentNode.parentNode.style.display = 'none' : document.getElementById('character-menu').parentNode.parentNode.style.display = ''}") shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False) - shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) shared.gradio['navigate_version'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_navigate_version_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False) + shared.gradio['edit_message'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False).then( + lambda: None, None, None, js='() => { const role = document.getElementById("Edit-message-role").querySelector("textarea").value; if (role === "user") document.getElementById("Regenerate").click(); }') + # Save/delete a character shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter'), show_progress=False) From 2db36da979b539263deacbd3ac8b3f6dbba7f97f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 27 May 2025 21:00:11 -0700 Subject: [PATCH 128/164] UI: Make scrollbars more discrete in dark mode --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 9d68ba02..90dd51bc 100644 --- a/css/main.css +++ b/css/main.css @@ -265,7 +265,7 @@ button { .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: #ccc; + background: rgba(255, 255, 255, 0.2); border-radius: 10px; } From f6ca0ee0727bceac867d5a5bbea0c6d61fea35ea Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 27 May 2025 21:20:51 -0700 Subject: [PATCH 129/164] Fix regenerate sometimes not creating a new message version --- modules/chat.py | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 9598efa7..59ca4d34 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -399,40 +399,26 @@ def get_stopping_strings(state): def add_message_version(history, row_idx, is_current=True): - """Add the current message as a version in the history metadata""" - if 'metadata' not in history: - history['metadata'] = {} - - if row_idx >= len(history['internal']) or not history['internal'][row_idx][1].strip(): - return # Skip if row doesn't exist or message is empty - key = f"assistant_{row_idx}" - - # Initialize metadata structures if needed if key not in history['metadata']: - history['metadata'][key] = {"timestamp": get_current_timestamp()} + history['metadata'][key] = {} + if "versions" not in history['metadata'][key]: history['metadata'][key]["versions"] = [] - # Check if this version already exists current_content = history['internal'][row_idx][1] current_visible = history['visible'][row_idx][1] - for i, version in enumerate(history['metadata'][key]["versions"]): - if version['content'] == current_content and version['visible_content'] 
== current_visible: - if is_current: - history['metadata'][key]["current_version_index"] = i - return - - # Add current message as a version + # Always add the current message as a new version entry. + # The timestamp will differentiate it even if content is identical to a previous version. history['metadata'][key]["versions"].append({ "content": current_content, "visible_content": current_visible, "timestamp": get_current_timestamp() }) - # Update index if this is the current version if is_current: + # Set the current_version_index to the newly added version (which is now the last one). history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1 From 1b0e2d8750ee315086acb2738fab76ad28abadb8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 27 May 2025 22:36:24 -0700 Subject: [PATCH 130/164] UI: Add a token counter to the chat tab (counts input + history) --- css/main.css | 7 ++++++ modules/chat.py | 54 +++++++++++++++++++++++++++++++++++++++++++++- modules/ui_chat.py | 9 ++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 90dd51bc..6e030453 100644 --- a/css/main.css +++ b/css/main.css @@ -1542,3 +1542,10 @@ strong { opacity: 0.8; user-select: none; } + +.token-display { + font-family: monospace; + font-size: 13px; + color: var(--body-text-color-subdued); + margin-top: 4px; +} diff --git a/modules/chat.py b/modules/chat.py index 59ca4d34..498c0d88 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -230,7 +230,15 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) user_input = user_input.strip() - if user_input and not impersonate and not _continue: + + # Check if we have attachments even with empty input + has_attachments = False + if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" + has_attachments = user_key in metadata and "attachments" in metadata[user_key] + + if (user_input or has_attachments) and not impersonate and not _continue: # For the current user input being processed, check if we need to add attachments if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: current_row_idx = len(history) @@ -350,6 +358,50 @@ def generate_chat_prompt(user_input, state, **kwargs): return prompt +def count_prompt_tokens(text_input, state): + """Count tokens for current history + input including attachments""" + if shared.tokenizer is None: + return "Tokenizer not available" + + try: + # Handle dict format with text and files + files = [] + if isinstance(text_input, dict): + files = text_input.get('files', []) + text = text_input.get('text', '') + else: + text = text_input + files = [] + + # Create temporary history copy to add attachments + temp_history = copy.deepcopy(state['history']) + if 'metadata' not in temp_history: + temp_history['metadata'] = {} + + # Process attachments if any + if files: + row_idx = len(temp_history['internal']) + for file_path in files: + add_message_attachment(temp_history, row_idx, file_path, is_user=True) + + # Create temp state with modified history + temp_state = copy.deepcopy(state) + temp_state['history'] = temp_history + + # Build prompt using existing logic + prompt = generate_chat_prompt(text, temp_state) + current_tokens = get_encoded_length(prompt) + max_tokens = temp_state['truncation_length'] + + 
percentage = (current_tokens / max_tokens) * 100 if max_tokens > 0 else 0 + + return f"History + Input:
{current_tokens:,} / {max_tokens:,} tokens ({percentage:.1f}%)" + + except Exception as e: + logger.error(f"Error counting tokens: {e}") + return f"Error: {str(e)}" + + def get_stopping_strings(state): stopping_strings = [] renderers = [] diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 2856ce1f..952a40a5 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -95,6 +95,11 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) + with gr.Row(): + shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm') + + shared.gradio['token_display'] = gr.HTML(value='', elem_classes='token-display') + # Hidden elements for version navigation and editing with gr.Row(visible=False): shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") @@ -360,3 +365,7 @@ def create_event_handlers(): None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') shared.gradio['show_controls'].change(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') + + shared.gradio['count_tokens'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.count_prompt_tokens, gradio('textbox', 'interface_state'), gradio('token_display'), show_progress=False) From 077bbc6b101f8f6045b95369bc82373187741d12 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Wed, 28 May 2025 04:27:28 -0300 Subject: [PATCH 131/164] Add web search support (#7023) --- modules/chat.py | 4 + modules/ui.py | 6 +- modules/ui_chat.py | 12 ++ modules/web_search.py | 125 ++++++++++++++++++ requirements/full/requirements.txt | 2 + requirements/full/requirements_amd.txt | 2 + requirements/full/requirements_amd_noavx2.txt | 2 + .../full/requirements_apple_intel.txt | 2 + .../full/requirements_apple_silicon.txt | 2 + requirements/full/requirements_cpu_only.txt | 2 + .../full/requirements_cpu_only_noavx2.txt | 2 + requirements/full/requirements_noavx2.txt | 2 + requirements/full/requirements_nowheels.txt | 2 + requirements/portable/requirements.txt | 2 + .../portable/requirements_apple_intel.txt | 2 + .../portable/requirements_apple_silicon.txt | 2 + .../portable/requirements_cpu_only.txt | 2 + .../portable/requirements_cpu_only_noavx2.txt | 2 + requirements/portable/requirements_noavx2.txt | 2 + .../portable/requirements_nowheels.txt | 2 + requirements/portable/requirements_vulkan.txt | 2 + .../portable/requirements_vulkan_noavx2.txt | 2 + 22 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 modules/web_search.py diff --git a/modules/chat.py b/modules/chat.py index 498c0d88..b2aacd5c 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -31,6 +31,7 @@ from modules.text_generation import ( get_max_prompt_length ) from modules.utils import delete_file, get_available_characters, save_file +from modules.web_search import add_web_search_attachments def strftime_now(format): @@ -566,6 +567,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess for file_path in files: add_message_attachment(output, row_idx, file_path, is_user=True) + # Add web search results as attachments if enabled + add_web_search_attachments(output, row_idx, 
text, state) + # Apply extensions text, visible_text = apply_extensions('chat_input', text, visible_text, state) text = apply_extensions('input', text, state, is_chat=True) diff --git a/modules/ui.py b/modules/ui.py index 00393b53..e24e6402 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -157,8 +157,6 @@ def list_model_elements(): def list_interface_input_elements(): elements = [ - 'navigate_message_index', - 'navigate_direction', 'temperature', 'dynatemp_low', 'dynatemp_high', @@ -218,6 +216,10 @@ def list_interface_input_elements(): 'edit_message_text', 'edit_message_role', 'branch_index', + 'enable_web_search', + 'web_search_pages', + 'navigate_message_index', + 'navigate_direction', ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 952a40a5..719af85a 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -86,6 +86,12 @@ def create_ui(): with gr.Row(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) + with gr.Row(): + shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search') + + with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: + shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) + with gr.Row(): shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') @@ -369,3 +375,9 @@ def create_event_handlers(): shared.gradio['count_tokens'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.count_prompt_tokens, gradio('textbox', 'interface_state'), gradio('token_display'), show_progress=False) + + shared.gradio['enable_web_search'].change( + lambda x: gr.update(visible=x), + gradio('enable_web_search'), + gradio('web_search_row') + ) diff --git a/modules/web_search.py b/modules/web_search.py new file mode 100644 index 00000000..e7688ba4 --- /dev/null +++ b/modules/web_search.py @@ -0,0 +1,125 @@ +from datetime import datetime + +import requests +from bs4 import BeautifulSoup +from duckduckgo_search import DDGS + +from modules.logging_colors import logger +from modules.text_generation import generate_reply + + +def get_current_timestamp(): + """Returns the current time in 24-hour format""" + return datetime.now().strftime('%b %d, %Y %H:%M') + + +def generate_search_query(user_message, state): + """Generate a search query from user message using the LLM""" + search_prompt = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else." 
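
# A minimal illustrative sketch (not a diff line) of how the functions defined in this
# new module fit together, based on the code added in this patch:
#
#     add_web_search_attachments(history, row_idx, user_message, state)
#       -> generate_search_query(user_message, state)      # LLM turns the message into a short query
#       -> perform_web_search(query, num_pages)            # DDGS search; pages fetched with requests + BeautifulSoup
#       -> history['metadata'][f"user_{row_idx}"]["attachments"] gains items like
#          {"name": title, "type": "text/html", "content": f"URL: {url}\n\n{page_text}"}
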
+ + # Use a minimal state for search query generation + search_state = state.copy() + search_state['max_new_tokens'] = 64 + search_state['temperature'] = 0.1 + + query = "" + for reply in generate_reply(search_prompt, search_state, stopping_strings=[], is_chat=False): + query = reply.strip() + + return query + + +def download_web_page(url, timeout=10): + """Download and extract text from a web page""" + try: + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + response = requests.get(url, headers=headers, timeout=timeout) + response.raise_for_status() + + soup = BeautifulSoup(response.content, 'html.parser') + + # Remove script and style elements + for script in soup(["script", "style"]): + script.decompose() + + # Get text and clean it up + text = soup.get_text() + lines = (line.strip() for line in text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + text = ' '.join(chunk for chunk in chunks if chunk) + + return text + except Exception as e: + logger.error(f"Error downloading {url}: {e}") + return f"[Error downloading content from {url}: {str(e)}]" + + +def perform_web_search(query, num_pages=3): + """Perform web search and return results with content""" + try: + with DDGS() as ddgs: + results = list(ddgs.text(query, max_results=num_pages)) + + search_results = [] + for i, result in enumerate(results): + url = result.get('href', '') + title = result.get('title', f'Search Result {i+1}') + + # Download page content + content = download_web_page(url) + + search_results.append({ + 'title': title, + 'url': url, + 'content': content + }) + + return search_results + except Exception as e: + logger.error(f"Error performing web search: {e}") + return [] + + +def add_web_search_attachments(history, row_idx, user_message, state): + """Perform web search and add results as attachments""" + if not state.get('enable_web_search', False): + return + + try: + # Generate search query + search_query = generate_search_query(user_message, state) + if not search_query: + logger.warning("Failed to generate search query") + return + + logger.info(f"Generated search query: {search_query}") + + # Perform web search + num_pages = int(state.get('web_search_pages', 3)) + search_results = perform_web_search(search_query, num_pages) + + if not search_results: + logger.warning("No search results found") + return + + # Add search results as attachments + key = f"user_{row_idx}" + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "attachments" not in history['metadata'][key]: + history['metadata'][key]["attachments"] = [] + + for result in search_results: + attachment = { + "name": f"{result['title']}", + "type": "text/html", + "content": f"URL: {result['url']}\n\n{result['content']}" + } + history['metadata'][key]["attachments"].append(attachment) + + logger.info(f"Added {len(search_results)} web search results as attachments") + + except Exception as e: + logger.error(f"Error in web search: {e}") diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 3d18f5fd..0eaf10da 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,7 +1,9 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_amd.txt 
b/requirements/full/requirements_amd.txt index 82b19964..65f184bf 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index a8b03014..d20b2ec3 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 5a61ac7d..2613d787 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 6862c3b4..af583b00 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index e6982779..9bf2a37d 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 97bff786..1731448e 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 17c7e246..fc481a1a 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -1,7 +1,9 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 89b32caf..2ed8affa 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index ec9bafc6..fdae681d 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 025a737e..a58f39f7 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ 
b/requirements/portable/requirements_apple_intel.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 32644e87..91ea3a6d 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index bd5c1d9b..37e5aa40 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 51f2b7d9..dcb2884b 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index aad6bf5a..8f1295bb 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index 4c055426..21805fe2 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 3d98d1b0..858b4488 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index f954b8d2..569bae99 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 From 75c6ae8502cae60bd8dabef1e2af4aec5766ca35 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 00:29:17 -0700 Subject: [PATCH 132/164] UI: Don't edit messages on double click --- js/main.js | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/js/main.js b/js/main.js index fc014f66..48bb8632 100644 --- a/js/main.js +++ b/js/main.js @@ -380,16 +380,6 @@ document.addEventListener("click", function (event) { } }); -document.addEventListener("dblclick", (event) => { - const messageElement = event.target.closest(".message, .user-message, .assistant-message"); - if (!messageElement) return; - - const editButton = messageElement.querySelector(".footer-edit-button"); - if (editButton) { - editButton.click(); - } -}); - 
//------------------------------------------------ // Relocate the "Show controls" checkbox //------------------------------------------------ From 0aedb8992165b386dac244baeb5fb5967513869e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 00:35:20 -0700 Subject: [PATCH 133/164] UI: Small style improvement to attachments --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index 6e030453..181a19b8 100644 --- a/css/main.css +++ b/css/main.css @@ -1417,6 +1417,7 @@ strong { flex-wrap: wrap; gap: 8px; margin-top: 8px; + padding-bottom: 6px; } .attachment-box { From 6c3590ba9ab0bd540097a50986a59f0099d11d92 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 05:28:15 -0700 Subject: [PATCH 134/164] Make web search attachments clickable --- modules/html_generator.py | 8 +++++++- modules/web_search.py | 5 +++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/modules/html_generator.py b/modules/html_generator.py index 9a93555f..bfb278cd 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -370,10 +370,16 @@ def format_message_attachments(history, role, index): attachments_html = '
' for attachment in attachments: + name = html.escape(attachment["name"]) + + # Make clickable if URL exists + if "url" in attachment: + name = f'{name}' + attachments_html += ( f'
' f'
{attachment_svg}
' - f'
{html.escape(attachment["name"])}
' + f'
{name}
' f'
' ) attachments_html += '
' diff --git a/modules/web_search.py b/modules/web_search.py index e7688ba4..d3387ac9 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -113,9 +113,10 @@ def add_web_search_attachments(history, row_idx, user_message, state): for result in search_results: attachment = { - "name": f"{result['title']}", + "name": result['title'], "type": "text/html", - "content": f"URL: {result['url']}\n\n{result['content']}" + "url": result['url'], + "content": result['content'] } history['metadata'][key]["attachments"].append(attachment) From 27641ac1823751165615a1a53b62ae24977e37a0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 17:09:05 -0700 Subject: [PATCH 135/164] UI: Make message editing work the same for user and assistant messages --- js/global_scope_js.js | 28 ++++++------ modules/chat.py | 94 ++++++++++++++++++++------------------- modules/html_generator.py | 42 ++++++++++------- modules/ui.py | 3 +- modules/ui_chat.py | 4 +- 5 files changed, 94 insertions(+), 77 deletions(-) diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 0e86d450..3274f47e 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -186,31 +186,33 @@ function navigateVersion(element, direction) { const index = messageElement.getAttribute("data-index"); if (!index) return; - const indexInput = document.getElementById("Navigate-message-index").querySelector("input"); - if (!indexInput) { - console.error("Element with ID 'Navigate-message-index' not found."); - return; - } - - const directionInput = document.getElementById("Navigate-direction").querySelector("textarea"); - if (!directionInput) { - console.error("Element with ID 'Navigate-direction' not found."); - return; + // Determine role based on message element classes + let role = "assistant"; // Default role + if (messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") || + messageElement.querySelector(".circle-you")) { + role = "user"; } + const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input"); + const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea"); + const roleInput = document.getElementById("Navigate-message-role")?.querySelector("textarea"); const navigateButton = document.getElementById("Navigate-version"); - if (!navigateButton) { - console.error("Required element 'Navigate-version' not found."); + + if (!indexInput || !directionInput || !roleInput || !navigateButton) { + console.error("Navigation control elements (index, direction, role, or button) not found."); return; } indexInput.value = index; directionInput.value = direction; + roleInput.value = role; - // Trigger any 'change' or 'input' events Gradio might be listening for + // Trigger 'input' events for Gradio to pick up changes const event = new Event("input", { bubbles: true }); indexInput.dispatchEvent(event); directionInput.dispatchEvent(event); + roleInput.dispatchEvent(event); navigateButton.click(); } diff --git a/modules/chat.py b/modules/chat.py index b2aacd5c..8bac680c 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -451,19 +451,21 @@ def get_stopping_strings(state): return result -def add_message_version(history, row_idx, is_current=True): - key = f"assistant_{row_idx}" +def add_message_version(history, role, row_idx, is_current=True): + key = f"{role}_{row_idx}" + if 'metadata' not in history: + history['metadata'] = {} if key not in history['metadata']: 
history['metadata'][key] = {} if "versions" not in history['metadata'][key]: history['metadata'][key]["versions"] = [] - current_content = history['internal'][row_idx][1] - current_visible = history['visible'][row_idx][1] + # Determine which index to use for content based on role + content_idx = 0 if role == 'user' else 1 + current_content = history['internal'][row_idx][content_idx] + current_visible = history['visible'][row_idx][content_idx] - # Always add the current message as a new version entry. - # The timestamp will differentiate it even if content is identical to a previous version. history['metadata'][key]["versions"].append({ "content": current_content, "visible_content": current_visible, @@ -594,7 +596,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess # Store the first response as a version before regenerating if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'): - add_message_version(output, row_idx, is_current=False) + add_message_version(output, "assistant", row_idx, is_current=False) if loading_message: yield { @@ -656,12 +658,13 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if is_stream: yield output + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) + # Add the newly generated response as a version (only for regeneration) if regenerate: row_idx = len(output['internal']) - 1 - add_message_version(output, row_idx, is_current=True) + add_message_version(output, "assistant", row_idx, is_current=True) - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output @@ -1441,37 +1444,35 @@ def handle_edit_message_click(state): if message_index >= len(history['internal']): html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html_output, gr.update()] + return [history, html_output, gr.update()] # No unique_id change - # Use the role passed from frontend - is_user_msg = (role == "user") - role_idx = 0 if is_user_msg else 1 + role_idx = 0 if role == "user" else 1 - # For assistant messages, save the original version BEFORE updating content - if not is_user_msg: - if not history['metadata'].get(f"assistant_{message_index}", {}).get('versions'): - add_message_version(history, message_index, is_current=False) + if 'metadata' not in history: + history['metadata'] = {} + + key = f"{role}_{message_index}" + if key not in history['metadata']: + history['metadata'][key] = {} + + # If no versions exist yet for this message, store the current (pre-edit) content as the first version. 
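
# Illustrative shape of the metadata after an edit (values are made up): the pre-edit
# text saved here becomes the first version, and add_message_version() further down
# appends the edited text and points current_version_index at it, roughly:
#
#     history['metadata']['assistant_3'] = {
#         'versions': [
#             {'content': '...', 'visible_content': '...', 'timestamp': 'May 28, 2025 17:05'},
#             {'content': '...', 'visible_content': '...', 'timestamp': 'May 28, 2025 17:09'},
#         ],
#         'current_version_index': 1,
#     }
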
+ if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]: + original_content = history['internal'][message_index][role_idx] + original_visible = history['visible'][message_index][role_idx] + + history['metadata'][key]["versions"] = [{ + "content": original_content, + "visible_content": original_visible, + "timestamp": get_current_timestamp() + }] - # NOW update the message content history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True) history['visible'][message_index][role_idx] = html.escape(new_text) - # Branch if editing user message, add version if editing assistant message - if is_user_msg: - # Branch like branch-here - history['visible'] = history['visible'][:message_index + 1] - history['internal'] = history['internal'][:message_index + 1] - new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') - save_history(history, new_unique_id, state['character_menu'], state['mode']) - histories = find_all_histories_with_first_prompts(state) - past_chats_update = gr.update(choices=histories, value=new_unique_id) - state['unique_id'] = new_unique_id - elif not is_user_msg: - # Add the new version as current - add_message_version(history, message_index, is_current=True) - past_chats_update = gr.update() - else: - past_chats_update = gr.update() + add_message_version(history, role, message_index, is_current=True) + + # Since we are not branching, unique_id does not change. + past_chats_update = gr.update() save_history(history, state['unique_id'], state['character_menu'], state['mode']) html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) @@ -1483,33 +1484,36 @@ def handle_navigate_version_click(state): history = state['history'] message_index = int(state['navigate_message_index']) direction = state['navigate_direction'] + role = state['navigate_message_role'] - # Get assistant message metadata - key = f"assistant_{message_index}" - if key not in history['metadata'] or 'versions' not in history['metadata'][key]: - # No versions to navigate + if not role: + logger.error("Role not provided for version navigation.") + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + key = f"{role}_{message_index}" + if 'metadata' not in history or key not in history['metadata'] or 'versions' not in history['metadata'][key]: html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) return [history, html] metadata = history['metadata'][key] - current_idx = metadata.get('current_version_index', 0) versions = metadata['versions'] + # Default to the last version if current_version_index is not set + current_idx = metadata.get('current_version_index', len(versions) - 1 if versions else 0) - # Calculate new index if direction == 'left': new_idx = max(0, current_idx - 1) else: # right new_idx = min(len(versions) - 1, current_idx + 1) if new_idx == current_idx: - # No change needed html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) return [history, html] - # Update history with new version - version = versions[new_idx] - history['internal'][message_index][1] = version['content'] - history['visible'][message_index][1] = version['visible_content'] + msg_content_idx = 0 if role == 'user' else 1 # 0 for user content, 1 for assistant content 
in the pair + version_to_load = versions[new_idx] + history['internal'][message_index][msg_content_idx] = version_to_load['content'] + history['visible'][message_index][msg_content_idx] = version_to_load['visible_content'] metadata['current_version_index'] = new_idx # Redraw and save diff --git a/modules/html_generator.py b/modules/html_generator.py index bfb278cd..cbf3e19c 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -388,16 +388,17 @@ def format_message_attachments(history, role, index): return "" -def get_version_navigation_html(history, i): +def get_version_navigation_html(history, i, role): """Generate simple navigation arrows for message versions""" - key = f"assistant_{i}" + key = f"{role}_{i}" metadata = history.get('metadata', {}) if key not in metadata or 'versions' not in metadata[key]: return "" versions = metadata[key]['versions'] - current_idx = metadata[key].get('current_version_index', 0) + # Default to the last version if current_version_index isn't set in metadata + current_idx = metadata[key].get('current_version_index', len(versions) - 1 if versions else 0) if len(versions) <= 1: return "" @@ -413,22 +414,33 @@ def get_version_navigation_html(history, i): def actions_html(history, i, role, info_message=""): + action_buttons = "" + version_nav_html = "" + if role == "assistant": - return (f'
' - f'{copy_button}' - f'{edit_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' - f'{branch_button}' - f'{info_message}' - f'
' - f'{get_version_navigation_html(history, i)}') - return (f'
' + action_buttons = ( f'{copy_button}' f'{edit_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'{continue_button if i == len(history["visible"]) - 1 else ""}' + f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{branch_button}' + ) + + version_nav_html = get_version_navigation_html(history, i, "assistant") + elif role == "user": + action_buttons = ( + f'{copy_button}' + f'{edit_button}' + ) + + version_nav_html = get_version_navigation_html(history, i, "user") + + return (f'
' + f'{action_buttons}' f'{info_message}' - f'
') + f'
' + f'{version_nav_html}') def generate_instruct_html(history): diff --git a/modules/ui.py b/modules/ui.py index e24e6402..a2662e14 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -212,14 +212,13 @@ def list_interface_input_elements(): 'grammar_string', 'navigate_message_index', 'navigate_direction', + 'navigate_message_role', 'edit_message_index', 'edit_message_text', 'edit_message_role', 'branch_index', 'enable_web_search', 'web_search_pages', - 'navigate_message_index', - 'navigate_direction', ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 719af85a..df3d3929 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -110,6 +110,7 @@ def create_ui(): with gr.Row(visible=False): shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction") + shared.gradio['navigate_message_role'] = gr.Textbox(value="", elem_id="Navigate-message-role") shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version") shared.gradio['edit_message_index'] = gr.Number(value=-1, precision=0, elem_id="Edit-message-index") shared.gradio['edit_message_text'] = gr.Textbox(value="", elem_id="Edit-message-text") @@ -313,8 +314,7 @@ def create_event_handlers(): shared.gradio['edit_message'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False).then( - lambda: None, None, None, js='() => { const role = document.getElementById("Edit-message-role").querySelector("textarea").value; if (role === "user") document.getElementById("Regenerate").click(); }') + chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) # Save/delete a character shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) From 3eb0b77427ad7b87c128999fd915f97b22104819 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 18:14:51 -0700 Subject: [PATCH 136/164] Improve the web search query generation --- modules/chat.py | 25 ++++++++++++++++++++++++- modules/web_search.py | 29 ++++------------------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 8bac680c..495fe934 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -538,6 +538,27 @@ def extract_pdf_text(pdf_path): return f"[Error extracting PDF text: {str(e)}]" +def generate_search_query(user_message, state): + """Generate a search query from user message using the LLM""" + # Augment the user message with search instruction + augmented_message = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else." 
+ + # Use a minimal state for search query generation but keep the full history + search_state = state.copy() + search_state['max_new_tokens'] = 64 + search_state['auto_max_new_tokens'] = False + search_state['enable_thinking'] = False + + # Generate the full prompt using existing history + augmented message + formatted_prompt = generate_chat_prompt(augmented_message, search_state) + + query = "" + for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True): + query = reply.strip() + + return query + + def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False): # Handle dict format with text and files files = [] @@ -570,7 +591,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess add_message_attachment(output, row_idx, file_path, is_user=True) # Add web search results as attachments if enabled - add_web_search_attachments(output, row_idx, text, state) + if state.get('enable_web_search', False): + search_query = generate_search_query(text, state) + add_web_search_attachments(output, row_idx, text, search_query, state) # Apply extensions text, visible_text = apply_extensions('chat_input', text, visible_text, state) diff --git a/modules/web_search.py b/modules/web_search.py index d3387ac9..667178c5 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -13,22 +13,6 @@ def get_current_timestamp(): return datetime.now().strftime('%b %d, %Y %H:%M') -def generate_search_query(user_message, state): - """Generate a search query from user message using the LLM""" - search_prompt = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else." 
- - # Use a minimal state for search query generation - search_state = state.copy() - search_state['max_new_tokens'] = 64 - search_state['temperature'] = 0.1 - - query = "" - for reply in generate_reply(search_prompt, search_state, stopping_strings=[], is_chat=False): - query = reply.strip() - - return query - - def download_web_page(url, timeout=10): """Download and extract text from a web page""" try: @@ -82,19 +66,14 @@ def perform_web_search(query, num_pages=3): return [] -def add_web_search_attachments(history, row_idx, user_message, state): +def add_web_search_attachments(history, row_idx, user_message, search_query, state): """Perform web search and add results as attachments""" - if not state.get('enable_web_search', False): + if not search_query: + logger.warning("No search query provided") return try: - # Generate search query - search_query = generate_search_query(user_message, state) - if not search_query: - logger.warning("Failed to generate search query") - return - - logger.info(f"Generated search query: {search_query}") + logger.info(f"Using search query: {search_query}") # Perform web search num_pages = int(state.get('web_search_pages', 3)) From 7080a02252b9949297950ef3669361d21f4a6bcf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 18:15:21 -0700 Subject: [PATCH 137/164] Reduce the timeout for downloading web pages --- modules/web_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/web_search.py b/modules/web_search.py index 667178c5..070f850c 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -13,7 +13,7 @@ def get_current_timestamp(): return datetime.now().strftime('%b %d, %Y %H:%M') -def download_web_page(url, timeout=10): +def download_web_page(url, timeout=5): """Download and extract text from a web page""" try: headers = { From 75d6cfd14d1aed5ba19bd747479794cbd34212d0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 20:34:14 -0700 Subject: [PATCH 138/164] Download fetched web search results in parallel --- modules/web_search.py | 44 +++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/modules/web_search.py b/modules/web_search.py index 070f850c..1f670349 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -1,3 +1,5 @@ +import concurrent.futures +from concurrent.futures import as_completed from datetime import datetime import requests @@ -5,7 +7,6 @@ from bs4 import BeautifulSoup from duckduckgo_search import DDGS from modules.logging_colors import logger -from modules.text_generation import generate_reply def get_current_timestamp(): @@ -40,27 +41,50 @@ def download_web_page(url, timeout=5): return f"[Error downloading content from {url}: {str(e)}]" -def perform_web_search(query, num_pages=3): +def perform_web_search(query, num_pages=3, max_workers=5): """Perform web search and return results with content""" try: with DDGS() as ddgs: results = list(ddgs.text(query, max_results=num_pages)) - search_results = [] + # Prepare download tasks + download_tasks = [] for i, result in enumerate(results): url = result.get('href', '') title = result.get('title', f'Search Result {i+1}') + download_tasks.append((url, title, i)) - # Download page content - content = download_web_page(url) + search_results = [None] * len(download_tasks) # Pre-allocate to maintain order - search_results.append({ - 'title': title, - 'url': url, - 'content': content 
- }) + # Download pages in parallel + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all download tasks + future_to_task = { + executor.submit(download_web_page, task[0]): task + for task in download_tasks + } + + # Collect results as they complete + for future in as_completed(future_to_task): + url, title, index = future_to_task[future] + try: + content = future.result() + search_results[index] = { + 'title': title, + 'url': url, + 'content': content + } + except Exception as e: + logger.error(f"Error downloading {url}: {e}") + # Include failed downloads with empty content + search_results[index] = { + 'title': title, + 'url': url, + 'content': '' + } return search_results + except Exception as e: logger.error(f"Error performing web search: {e}") return [] From 63234b9b6f60ec4f276480b4e7f9d4cd1395dcaf Mon Sep 17 00:00:00 2001 From: Underscore <47636331+Th-Underscore@users.noreply.github.com> Date: Thu, 29 May 2025 07:22:03 -0400 Subject: [PATCH 139/164] UI: Fix impersonate (#7025) --- modules/chat.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 495fe934..7afd906d 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -691,16 +691,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess yield output -def impersonate_wrapper(text, state): +def impersonate_wrapper(textbox, state): + text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) prompt = generate_chat_prompt('', state, impersonate=True) stopping_strings = get_stopping_strings(state) - yield text + '...', static_output + textbox['text'] = text + '...' 
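
# Note (illustrative, not a diff line): the chat textbox value is a dict with a 'text'
# field (and 'files', per the {"text": "", "files": []} resets used by the other chat
# handlers in this series), so impersonate now reads and yields textbox['text'] instead
# of yielding a bare string.
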
+ yield textbox, static_output reply = None for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True): - yield (text + reply).lstrip(' '), static_output + textbox['text'] = (text + reply).lstrip(' ') + yield textbox, static_output if shared.stop_everything: return From a8d02dec8f5e6a054a153b3b09425b51e090ae11 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:24:21 -0700 Subject: [PATCH 140/164] Bump llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 0eaf10da..5f61aff9 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 65f184bf..a718b6ca 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; 
platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index d20b2ec3..5fddc623 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 2613d787..8e014445 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index af583b00..77779f3d 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,8 +32,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 9bf2a37d..79efc607 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -32,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 1731448e..8b29453e 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -32,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index fc481a1a..f1f4a02e 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index fdae681d..adf50d9a 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index a58f39f7..46b36791 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 91ea3a6d..66052711 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt 
b/requirements/portable/requirements_cpu_only.txt index 37e5aa40..4013abcc 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index dcb2884b..41808854 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 8f1295bb..cff79ec6 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 858b4488..762b3fa3 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 569bae99..b425d305 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 685cfe254036111711de027f6d3a8198d02e7545 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:26:43 -0700 Subject: [PATCH 141/164] Lint --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 181a19b8..8af87b42 100644 --- a/css/main.css +++ b/css/main.css @@ -265,7 +265,7 @@ button { .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: rgba(255, 255, 255, 0.2); + background: rgb(255 255 255 / 20%); border-radius: 10px; } From f2ee917d4f600ebbc5fa9d5fcf65cf5feef27fc1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:55:05 -0700 Subject: [PATCH 142/164] Update README --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7105ce23..afb21cb0 100644 --- a/README.md +++ b/README.md @@ -14,14 +14,17 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. 
+- **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. +- **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. +- **Advanced chat management**: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. +- **Automatic prompt formatting** using Jinja2 templates. You don't need to ever worry about prompt formats. - UI that resembles the original ChatGPT style. -- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. - Multiple sampling parameters and generation options for sophisticated text generation control. - Switch between different models easily in the UI without restarting, with fine control over settings. - OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). -- 100% offline and private, with zero telemetry, external resources, or remote update requests. +- 100% offline and private, with zero telemetry, external resources, or remote update requests. Web search is optional and user-controlled. - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. ## How to install From 2a9699033d90f4ffedfb22cbba7003c6441d08dc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:55:59 -0700 Subject: [PATCH 143/164] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index afb21cb0..05809436 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,8 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. -- **Advanced chat management**: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. -- **Automatic prompt formatting** using Jinja2 templates. You don't need to ever worry about prompt formats. +- Advanced chat management: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. +- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. - UI that resembles the original ChatGPT style. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. 
- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. From 9a94d7b4f6ae95b6b4b2fc521b5b25c300915dc9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 05:02:52 -0700 Subject: [PATCH 144/164] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 05809436..900d5fbd 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). -- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. +- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory without affecting your system. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. - Advanced chat management: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. From 0986d075fb22dc5aa582bbefdfdb0ebdb6ee92c8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 05:03:59 -0700 Subject: [PATCH 145/164] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 900d5fbd..ec01c0aa 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory without affecting your system. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. -- Advanced chat management: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. +- Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. - UI that resembles the original ChatGPT style. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. 
From 36bc2760058ed4e6998f4c55176c7311b0facabe Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 05:39:26 -0700 Subject: [PATCH 146/164] Update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ec01c0aa..9accffb7 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. - Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. +- Automatic GPU layers for GGUF models (on NVIDIA GPUs). - UI that resembles the original ChatGPT style. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. From 81794692ab6fbc0ef24c7484b6571de090984dde Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 08:07:14 -0700 Subject: [PATCH 147/164] UI: Make the dark theme darker --- css/main.css | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/css/main.css b/css/main.css index 8af87b42..0d0a13cf 100644 --- a/css/main.css +++ b/css/main.css @@ -1,11 +1,11 @@ :root { --darker-gray: #202123; - --dark-gray: #343541; - --light-gray: #444654; + --dark-gray: #2A2B32; + --light-gray: #373943; --light-theme-gray: #f9fbff; --border-color-dark: #525252; --header-width: 112px; - --selected-item-color-dark: #32333e; + --selected-item-color-dark: #2E2F38; } @font-face { From c970c5f1665c3966c84ba50a05a45d2598038ea6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 08:15:13 -0700 Subject: [PATCH 148/164] Make scrollbars darker in dark theme --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 0d0a13cf..7f9d4618 100644 --- a/css/main.css +++ b/css/main.css @@ -265,7 +265,7 @@ button { .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: rgb(255 255 255 / 20%); + background: rgb(255 255 255 / 10%); border-radius: 10px; } From 3f37a2e915a31b273caddd12a80412a199d753a7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 08:49:31 -0700 Subject: [PATCH 149/164] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9accffb7..361584f8 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. 
## Features - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). -- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory without affecting your system. +- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. - Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. From faa5c82c64e2036762ed3ff60a38fc5b37dac36d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 09:02:34 -0700 Subject: [PATCH 150/164] Fix message version count not updating during regeneration streaming --- modules/chat.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 7afd906d..90d66687 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -617,10 +617,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if regenerate: row_idx = len(output['internal']) - 1 - # Store the first response as a version before regenerating + # Store the old response as a version before regenerating if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'): add_message_version(output, "assistant", row_idx, is_current=False) + # Add new empty version (will be filled during streaming) + key = f"assistant_{row_idx}" + output['metadata'][key]["versions"].append({ + "content": "", + "visible_content": "", + "timestamp": get_current_timestamp() + }) + output['metadata'][key]["current_version_index"] = len(output['metadata'][key]["versions"]) - 1 + if loading_message: yield { 'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]], @@ -673,20 +682,34 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if _continue: output['internal'][-1] = [text, last_reply[0] + reply] output['visible'][-1] = [visible_text, last_reply[1] + visible_reply] - if is_stream: - yield output elif not (j == 0 and visible_reply.strip() == ''): output['internal'][-1] = [text, reply.lstrip(' ')] output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')] - if is_stream: - yield output + + # Keep version metadata in sync during streaming (for regeneration) + if regenerate: + row_idx = len(output['internal']) - 1 + key = f"assistant_{row_idx}" + current_idx = output['metadata'][key]['current_version_index'] + output['metadata'][key]['versions'][current_idx].update({ + 'content': output['internal'][row_idx][1], + 'visible_content': output['visible'][row_idx][1] + }) + + if is_stream: + yield 
output output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) - # Add the newly generated response as a version (only for regeneration) + # Final sync for version metadata (in case streaming was disabled) if regenerate: row_idx = len(output['internal']) - 1 - add_message_version(output, "assistant", row_idx, is_current=True) + key = f"assistant_{row_idx}" + current_idx = output['metadata'][key]['current_version_index'] + output['metadata'][key]['versions'][current_idx].update({ + 'content': output['internal'][row_idx][1], + 'visible_content': output['visible'][row_idx][1] + }) yield output From 724147ffabce95b5d20528b83b6e44c1523d58f0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 10:49:29 -0700 Subject: [PATCH 151/164] Better detect when no model is available --- modules/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/utils.py b/modules/utils.py index 0e8bdd18..577c55b8 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -74,7 +74,7 @@ def natural_keys(text): def check_model_loaded(): if shared.model_name == 'None' or shared.model is None: - if len(get_available_models()) <= 1: + if len(get_available_models()) == 0: error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it" logger.error(error_msg) return False, error_msg From e7129f9dbefbe87fa4c425b5873f80cbddaf7cf0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 12:45:53 -0700 Subject: [PATCH 152/164] Prevent footer buttons below last assistant message from always appearing --- js/main.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/js/main.js b/js/main.js index 48bb8632..ea3ff46a 100644 --- a/js/main.js +++ b/js/main.js @@ -171,7 +171,6 @@ const observer = new MutationObserver(function(mutations) { document.getElementById("Generate").style.display = "flex"; } - doSyntaxHighlighting(); if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) { @@ -184,7 +183,7 @@ const observer = new MutationObserver(function(mutations) { const lastChild = messagesContainer?.lastElementChild; const prevSibling = lastChild?.previousElementSibling; if (lastChild && prevSibling) { - lastChild.style.minHeight = `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px))`; + lastChild.style.setProperty("margin-bottom", `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px)`, "important"); } } }); From aff41f3482bc7045334b0d81ac514723fdbd4f97 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 12:53:41 -0700 Subject: [PATCH 153/164] Update README --- README.md | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 361584f8..daf409d0 100644 --- a/README.md +++ b/README.md @@ -189,13 +189,13 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [-- [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] 
[--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] - [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] + [--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] - [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] - [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] - [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] - [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api] - [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] + [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] + [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] + [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] + [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] + [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] Text generation web UI @@ -217,7 +217,7 @@ Basic settings: --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. Model loader: - --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, + --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM. Transformers/Accelerate: @@ -248,16 +248,18 @@ llama.cpp: --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. --no-mmap Prevent mmap from being used. --mlock Force the system to keep the model in RAM. - --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. + --gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU. --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. --numa Activate NUMA task allocation for llama.cpp. --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. --row-split Split the model by rows across GPUs. 
This may improve multi-gpu performance. - --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU" + --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU" --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. -Context and cache management: +Context and cache: --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. + --cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits + separately, e.g. q4_q8). Speculative decoding: --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. @@ -276,15 +278,9 @@ ExLlamaV2: --num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral. --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. -HQQ: - --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. - TensorRT-LLM: --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. -Cache: - --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. - DeepSpeed: --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. @@ -307,6 +303,7 @@ Gradio: --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy --old-colors Use the legacy Gradio colors, before the December/2024 update. + --portable Hide features not available in portable mode like training. API: --api Enable the API extension. 
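The API flags above expose the OpenAI-compatible Chat and Completions endpoints listed under Features. As a minimal client-side sketch — assuming the server was launched with `--api`, is reachable on the default API port 5000 on localhost, and already has a model loaded — a chat completion request can be made as shown below; the URL, prompt, and sampling values are illustrative and not part of the patches above:

```python
# Minimal sketch of a call to the OpenAI-compatible Chat Completions endpoint.
# Assumes: server started with --api, default API port 5000, a model already loaded.
import requests

url = "http://127.0.0.1:5000/v1/chat/completions"  # adjust host/port to match your --api-port setting
payload = {
    "messages": [
        {"role": "user", "content": "Summarize speculative decoding in one sentence."}
    ],
    "max_tokens": 200,
    "temperature": 0.7,
}

response = requests.post(url, json=payload, timeout=120)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```

If `--api-key` is set, the key would typically be sent as a Bearer token in the `Authorization` header, following the OpenAI client convention that these endpoints mirror.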
From f59998d2680f346038320b536617c4738c393947 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 13:08:48 -0700 Subject: [PATCH 154/164] Don't limit the number of prompt characters printed with --verbose --- modules/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 962311df..1fd6d810 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -505,11 +505,11 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N return -def print_prompt(prompt, max_chars=2000): +def print_prompt(prompt, max_chars=-1): DARK_YELLOW = "\033[38;5;3m" RESET = "\033[0m" - if len(prompt) > max_chars: + if max_chars > 0 and len(prompt) > max_chars: half_chars = max_chars // 2 hidden_len = len(prompt[half_chars:-half_chars]) hidden_msg = f"{DARK_YELLOW}[...{hidden_len} characters hidden...]{RESET}" From a45a65213052dad02d696ed54af1b9f2ea82cd4a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 13:28:51 -0700 Subject: [PATCH 155/164] CSS fix --- js/main.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index ea3ff46a..f23dc246 100644 --- a/js/main.js +++ b/js/main.js @@ -183,7 +183,10 @@ const observer = new MutationObserver(function(mutations) { const lastChild = messagesContainer?.lastElementChild; const prevSibling = lastChild?.previousElementSibling; if (lastChild && prevSibling) { - lastChild.style.setProperty("margin-bottom", `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px)`, "important"); + lastChild.style.setProperty("margin-bottom", + `max(0px, calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px))`, + "important" + ); } } }); From 8078c41ec67b96656d7e96128d915290b319e4f5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 13:32:19 -0700 Subject: [PATCH 156/164] Revert "Bump llama.cpp" This reverts commit a8d02dec8f5e6a054a153b3b09425b51e090ae11. 
--- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 5f61aff9..0eaf10da 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index a718b6ca..65f184bf 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; 
platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 5fddc623..d20b2ec3 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 8e014445..2613d787 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 77779f3d..af583b00 100644 --- 
a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -32,8 +32,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
 https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 79efc607..9bf2a37d 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -32,5 +32,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 8b29453e..1731448e 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -32,5 +32,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index f1f4a02e..fc481a1a 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -33,8 +33,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index adf50d9a..fdae681d 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 46b36791..a58f39f7 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 66052711..91ea3a6d 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 4013abcc..37e5aa40 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 41808854..dcb2884b 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index cff79ec6..8f1295bb 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 762b3fa3..858b4488 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index b425d305..569bae99 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From dce02732a4caef16157ffbc288dfe079053e0bb4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 29 May 2025 14:08:48 -0700
Subject: [PATCH 157/164] Fix timestamp issues when editing/swiping messages

---
 modules/chat.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index 90d66687..6b3ff4fc 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1508,11 +1508,12 @@ def handle_edit_message_click(state):
     if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]:
         original_content = history['internal'][message_index][role_idx]
         original_visible = history['visible'][message_index][role_idx]
+        original_timestamp = history['metadata'][key].get('timestamp', get_current_timestamp())
 
         history['metadata'][key]["versions"] = [{
             "content": original_content,
             "visible_content": original_visible,
-            "timestamp": get_current_timestamp()
+            "timestamp": original_timestamp
         }]
 
     history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True)
@@ -1564,6 +1565,7 @@ def handle_navigate_version_click(state):
     history['internal'][message_index][msg_content_idx] = version_to_load['content']
     history['visible'][message_index][msg_content_idx] = version_to_load['visible_content']
     metadata['current_version_index'] = new_idx
+    update_message_metadata(history['metadata'], role, message_index, timestamp=version_to_load['timestamp'])
 
     # Redraw and save
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

From acbcc12e7b19cc9f540d32b8d601ceefde77b7a1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 29 May 2025 14:11:21 -0700
Subject: [PATCH 158/164] Clean up

---
 modules/chat.py    | 7 ++-----
 modules/ui_chat.py | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 6b3ff4fc..e526a9a0 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1493,7 +1493,7 @@ def handle_edit_message_click(state):
 
     if message_index >= len(history['internal']):
         html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
-        return [history, html_output, gr.update()]  # No unique_id change
+        return [history, html_output]
 
     role_idx = 0 if role == "user" else 1
 
@@ -1521,13 +1521,10 @@ def handle_edit_message_click(state):
 
         add_message_version(history, role, message_index, is_current=True)
 
-    # Since we are not branching, unique_id does not change.
-    past_chats_update = gr.update()
-
     save_history(history, state['unique_id'], state['character_menu'], state['mode'])
     html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
-    return [history, html_output, past_chats_update]
+    return [history, html_output]
 
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index df3d3929..d79aa523 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -314,7 +314,7 @@ def create_event_handlers():
 
     shared.gradio['edit_message'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False)
+        chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False)
 
     # Save/delete a character
     shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False)

From d1bfb08e8d4bab174e6b4467eff20f8a01a2a613 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 29 May 2025 14:27:47 -0700
Subject: [PATCH 159/164] Improve the style of message editing

---
 css/main.css | 1 +
 1 file changed, 1 insertion(+)

diff --git a/css/main.css b/css/main.css
index 7f9d4618..9685c863 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1462,6 +1462,7 @@ strong {
 .editing-textarea {
     width: 100%;
     min-height: 200px;
+    max-height: 65vh;
     padding: 10px;
     border-radius: 5px;
     border: 1px solid #ccc;

From 28e6bd4fcd8cd385cc92cc56c0c49fc474006147 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 29 May 2025 14:49:07 -0700
Subject: [PATCH 160/164] Revert "Update transformers requirement in /requirements/full (#7017)"

This reverts commit cc9b7253c1216e5340da85cba9b65a13cf3526e9.
---
 requirements/full/requirements.txt | 2 +-
 requirements/full/requirements_amd.txt | 2 +-
 requirements/full/requirements_amd_noavx2.txt | 2 +-
 requirements/full/requirements_apple_intel.txt | 2 +-
 requirements/full/requirements_apple_silicon.txt | 2 +-
 requirements/full/requirements_cpu_only.txt | 2 +-
 requirements/full/requirements_cpu_only_noavx2.txt | 2 +-
 requirements/full/requirements_noavx2.txt | 2 +-
 requirements/full/requirements_nowheels.txt | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 0eaf10da..2c322715 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -23,7 +23,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 65f184bf..6aeb325e 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index d20b2ec3..3b052423 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 2613d787..8c51459e 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index af583b00..b9f15d45 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 9bf2a37d..0877d968 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 1731448e..cab78237 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index fc481a1a..dfd42577 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -23,7 +23,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 2ed8affa..5d9f84ce 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
From 7c29879e795776ceb742a8ddb47fd3843069cf34 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 30 May 2025 11:17:47 -0700
Subject: [PATCH 161/164] Fix 'Start reply with' (closes #7033)

---
 modules/chat.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index e526a9a0..881f7330 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -806,9 +806,12 @@ def remove_last_message(history):
     return html.unescape(last[0]), history
 
 
-def send_dummy_message(textbox, state):
+def send_dummy_message(text, state):
     history = state['history']
-    text = textbox['text']
+
+    # Handle both dict and string inputs
+    if isinstance(text, dict):
+        text = text['text']
 
     # Initialize metadata if not present
     if 'metadata' not in history:
@@ -822,9 +825,12 @@ def send_dummy_message(textbox, state):
     return history
 
 
-def send_dummy_reply(textbox, state):
+def send_dummy_reply(text, state):
     history = state['history']
-    text = textbox['text']
+
+    # Handle both dict and string inputs
+    if isinstance(text, dict):
+        text = text['text']
 
     # Initialize metadata if not present
     if 'metadata' not in history:

From 298d4719c6c9545a701a9cc9e8f4efceb108599a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 30 May 2025 11:32:24 -0700
Subject: [PATCH 162/164] Multiple small style improvements

---
 css/main.css  | 4 ++++
 modules/ui.py | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/css/main.css b/css/main.css
index 9685c863..967d94ed 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1551,3 +1551,7 @@ strong {
     color: var(--body-text-color-subdued);
     margin-top: 4px;
 }
+
+button:focus {
+    outline: none;
+}
diff --git a/modules/ui.py b/modules/ui.py
index a2662e14..9f4d67cb 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -71,6 +71,7 @@ if not shared.args.old_colors:
         block_background_fill_dark='transparent',
         block_border_color_dark='transparent',
         input_border_color_dark='var(--border-color-dark)',
+        input_border_color_focus_dark='var(--border-color-dark)',
         checkbox_border_color_dark='var(--border-color-dark)',
         border_color_primary_dark='var(--border-color-dark)',
         button_secondary_border_color_dark='var(--border-color-dark)',
@@ -89,6 +90,8 @@ if not shared.args.old_colors:
         checkbox_label_shadow='none',
         block_shadow='none',
         block_shadow_dark='none',
+        input_shadow_focus='none',
+        input_shadow_focus_dark='none',
         button_large_radius='0.375rem',
         button_large_padding='6px 12px',
         input_radius='0.375rem',

From 219f0a773166deeb0326c2874b29e66e382df524 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 30 May 2025 12:05:49 -0700
Subject: [PATCH 163/164] Fix exllamav3_hf models failing to unload (closes #7031)

---
 modules/exllamav3_hf.py | 17 +++++++++++++++++
 modules/models.py       | 3 +++
 2 files changed, 20 insertions(+)

diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index 417df473..1254ff5d 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -245,3 +245,20 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
             pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
 
         return Exllamav3HF(pretrained_model_name_or_path)
+
+    def unload(self):
+        """Properly unload the ExllamaV3 model and free GPU memory."""
+        if hasattr(self, 'ex_model') and self.ex_model is not None:
+            self.ex_model.unload()
+            self.ex_model = None
+
+        if hasattr(self, 'ex_cache') and self.ex_cache is not None:
+            self.ex_cache = None
+
+        # Clean up any additional ExllamaV3 resources
+        if hasattr(self, 'past_seq'):
+            self.past_seq = None
+        if hasattr(self, 'past_seq_negative'):
+            self.past_seq_negative = None
+        if hasattr(self, 'ex_cache_negative'):
+            self.ex_cache_negative = None
diff --git a/modules/models.py b/modules/models.py
index 4218d58c..d329ae3c 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -116,10 +116,13 @@ def unload_model(keep_model_name=False):
         return
 
     is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
+    if shared.args.loader == 'ExLlamav3_HF':
+        shared.model.unload()
 
     shared.model = shared.tokenizer = None
    shared.lora_names = []
    shared.model_dirty_from_training = False
+
    if not is_llamacpp:
        from modules.torch_utils import clear_torch_cache
        clear_torch_cache()

From 15f466ca3f8255f2566f016db8d7b8fd9ebef3f4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 30 May 2025 15:49:57 -0700
Subject: [PATCH 164/164] Update README

---
 README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index daf409d0..55df33d2 100644
--- a/README.md
+++ b/README.md
@@ -14,18 +14,18 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 
 - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)).
 - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
-- **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents.
-- **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation.
-- Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point.
-- Automatic GPU layers for GGUF models (on NVIDIA GPUs).
-- UI that resembles the original ChatGPT style.
-- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`.
-- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these.
+- 100% offline and private, with zero telemetry, external resources, or remote update requests.
 - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats.
+- **File attachments**: Upload text files and PDF documents to talk about their contents.
+- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
+- Aesthetic UI with dark and light themes.
+- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters.
+- Edit messages, navigate between message versions, and branch conversations at any point.
 - Multiple sampling parameters and generation options for sophisticated text generation control.
-- Switch between different models easily in the UI without restarting, with fine control over settings.
+- Switch between different models in the UI without restarting.
+- Automatic GPU layers for GGUF models (on NVIDIA GPUs).
+- Free-form text generation in the Default/Notebook tabs without being limited to chat turns.
 - OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
-- 100% offline and private, with zero telemetry, external resources, or remote update requests. Web search is optional and user-controlled.
 - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
 
 ## How to install