From c12a53c998ce39ec762b9f7895861f1d94c2d827 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 1 May 2025 19:46:56 -0700 Subject: [PATCH 001/164] Use turboderp's exllamav2 wheels --- requirements/full/requirements.txt | 6 +++--- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 6 +++--- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 6f265eba..c0ace41b 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -34,8 +34,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_c https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index c8e75ee7..91582eb3 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -30,5 +30,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == 
"x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index e54d6d9c..7b86050e 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -30,5 +30,5 @@ tiktoken # AMD wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index d714ea3d..cc747edb 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -32,4 +32,4 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 89f4f576..67b3260e 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -33,4 +33,4 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_c 
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index e216c9cd..3575d352 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -34,8 +34,8 @@ https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_c https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From f8aaf3c23a793b60ce7452213304acb493be98af Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 1 May 2025 19:50:46 -0700 Subject: [PATCH 002/164] Use ROCm 
6.2.4 on AMD --- README.md | 2 +- one_click.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4b541b9e..3280186c 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ conda activate textgen |--------|---------|---------| | Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | | Linux/WSL | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu` | -| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.1` | +| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` | | MacOS + MPS | Any | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` | | Windows | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | | Windows | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` | diff --git a/one_click.py b/one_click.py index 065afd99..cb16b813 100644 --- a/one_click.py +++ b/one_click.py @@ -222,7 +222,7 @@ def update_pytorch_and_python(): if "+cu" in torver: install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124" elif "+rocm" in torver: - install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1" + install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4" elif "+cpu" in torver: install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu" elif "+cxx11" in torver: @@ -273,7 +273,7 @@ def install_webui(): "What is your GPU?", { 'A': 'NVIDIA - CUDA 12.4', - 'B': 'AMD - Linux/macOS only, requires ROCm 6.1', + 'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4', 'C': 'Apple M Series', 'D': 'Intel Arc (beta)', 'N': 'CPU mode' @@ -314,7 +314,7 @@ def install_webui(): if selected_gpu == "NVIDIA": install_pytorch += "--index-url https://download.pytorch.org/whl/cu124" elif selected_gpu == "AMD": - install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1" + install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4" elif selected_gpu in ["APPLE", "NONE"]: install_pytorch += "--index-url https://download.pytorch.org/whl/cpu" elif selected_gpu == "INTEL": From d5c407cf35453ba2d06eea942942ff11cdc7993b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 1 May 2025 20:05:36 -0700 Subject: [PATCH 003/164] Use Vulkan instead of ROCm for llama.cpp on AMD --- requirements/full/requirements_amd.txt | 3 ++- requirements/full/requirements_amd_noavx2.txt | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 91582eb3..24eeee6a 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,6 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 7b86050e..99716f3c 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,6 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" From 9e3867dc8358baf153d6f7c182496dad158696a4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 09:36:15 -0700 Subject: [PATCH 004/164] llama.cpp: Fix manual random seeds --- modules/text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 8d091868..b9bf9b16 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -480,7 +480,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N For models that do not use the transformers library for sampling """ - seed = set_manual_seed(state['seed']) + state['seed'] = set_manual_seed(state['seed']) t0 = time.time() reply = '' try: From 3f26b0408bd02f500acc8c090a7e50ee286051b5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 16:17:22 -0700 Subject: [PATCH 005/164] Fix after 9e3867dc8358baf153d6f7c182496dad158696a4 --- modules/text_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index b9bf9b16..8fd65dc4 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -500,7 +500,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N t1 = time.time() original_tokens = len(encode(original_question)[0]) new_tokens = len(encode(original_question + reply)[0]) - original_tokens - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') + print(f'Output generated in 
{(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})') return From 905afced1c8339833280de254cd597b389a3dade Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 16:32:22 -0700 Subject: [PATCH 006/164] Add a --portable flag to hide things in portable mode --- modules/presets.py | 9 ++++++++- modules/shared.py | 1 + modules/ui_model_menu.py | 17 +++++++++++------ modules/ui_parameters.py | 2 +- server.py | 5 +++-- start_linux.sh | 2 +- start_macos.sh | 2 +- start_windows.bat | 2 +- 8 files changed, 27 insertions(+), 13 deletions(-) diff --git a/modules/presets.py b/modules/presets.py index a432bf52..50d0f985 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -11,7 +11,7 @@ from modules.logging_colors import logger def default_preset(): - return { + result = { 'temperature': 1, 'dynatemp_low': 1, 'dynatemp_high': 1, @@ -50,6 +50,13 @@ def default_preset(): 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', } + if shared.args.portable: + samplers = result['sampler_priority'].split('\n') + samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature", "repetition_penalty"]] + result['sampler_priority'] = '\n'.join(samplers) + + return result + def presets_params(): return [k for k in default_preset()] diff --git a/modules/shared.py b/modules/shared.py index fb10c014..39b0bdaa 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -190,6 +190,7 @@ group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certific group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None) group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy') group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.') +group.add_argument('--portable', action='store_true', help='Hide features not available in portable mode like training.') # API group = parser.add_argument_group('API') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index d13bcff7..4a49d209 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -35,14 +35,17 @@ def create_ui(): shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) with gr.Column(): - with gr.Row(): - shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) - shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) + if shared.args.portable: + pass + else: + with gr.Row(): + shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) + shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', 
interactive=not mu) with gr.Row(): with gr.Column(): - shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None) + shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None) with gr.Blocks(): with gr.Row(): with gr.Column(): @@ -150,7 +153,9 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) - shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) + if not shared.args.portable: + shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) + shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model')) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 3f609d71..071b30b6 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -21,7 +21,7 @@ def create_ui(default_preset): shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button') with gr.Column(): - shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') + shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown') with gr.Row(): with gr.Column(): diff --git a/server.py b/server.py index 169578a5..b0b9e633 100644 --- a/server.py +++ b/server.py @@ -90,7 +90,7 @@ def create_interface(): 'instruction_template_str': shared.settings['instruction_template_str'], 'prompt_menu-default': shared.settings['prompt-default'], 'prompt_menu-notebook': shared.settings['prompt-notebook'], - 'filter_by_loader': shared.args.loader or 'All' + 'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp' }) if Path("user_data/cache/pfp_character.png").exists(): @@ -127,7 +127,8 @@ def create_interface(): ui_parameters.create_ui(shared.settings['preset']) # Parameters tab ui_model_menu.create_ui() # Model tab - training.create_ui() # Training tab + if not shared.args.portable: + training.create_ui() # Training tab ui_session.create_ui() # Session tab # Generation events diff --git a/start_linux.sh b/start_linux.sh index 00082f07..c74f1272 100755 --- a/start_linux.sh +++ b/start_linux.sh @@ -4,7 +4,7 @@ cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case if [ -d "portable_env" ]; then - ./portable_env/bin/python3 server.py --api --auto-launch "$@" + ./portable_env/bin/python3 server.py --portable --api --auto-launch "$@" exit $? 
fi diff --git a/start_macos.sh b/start_macos.sh index 628f59cc..7a060ba6 100755 --- a/start_macos.sh +++ b/start_macos.sh @@ -4,7 +4,7 @@ cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case if [ -d "portable_env" ]; then - ./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@" + ./portable_env/bin/python3 server.py --portable --api --auto-launch --api-port 5005 "$@" exit $? fi diff --git a/start_windows.bat b/start_windows.bat index 451b85e0..1616ee27 100755 --- a/start_windows.bat +++ b/start_windows.bat @@ -5,7 +5,7 @@ cd /D "%~dp0" @rem Portable install case if exist "portable_env" ( - .\portable_env\python.exe server.py --api --auto-launch %* + .\portable_env\python.exe server.py --portable --api --auto-launch %* exit /b %errorlevel% ) From 4cea720da8cca27cbb5e8ac560019a55e6afb73a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 16:38:28 -0700 Subject: [PATCH 007/164] UI: Remove the "Autoload the model" feature --- modules/shared.py | 1 - modules/ui_model_menu.py | 9 ++------- user_data/settings-template.yaml | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index 39b0bdaa..cfedb992 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -60,7 +60,6 @@ settings = { 'custom_stopping_strings': '', 'custom_token_bans': '', 'negative_prompt': '', - 'autoload_model': False, 'dark_theme': True, 'default_extensions': [], 'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}", diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 4a49d209..9361ef91 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -30,7 +30,7 @@ def create_ui(): with gr.Row(): shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu) ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) - shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu) + shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu) shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) @@ -108,9 +108,6 @@ def create_ui(): shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. 
If 0, uses the same as the main model.') with gr.Column(): - with gr.Row(): - shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu) - with gr.Tab("Download"): shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu) shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu) @@ -135,11 +132,10 @@ def create_event_handlers(): # In this event handler, the interface state is read and updated # with the model defaults (if any), and then the model is loaded - # unless "autoload_model" is unchecked shared.gradio['model_menu'].change( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then( - load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success( + partial(load_model_wrapper, autoload=False), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) shared.gradio['load_model'].click( @@ -158,7 +154,6 @@ def create_event_handlers(): shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) - shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model')) shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True) diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml index 20896da3..ce0f77e1 100644 --- a/user_data/settings-template.yaml +++ b/user_data/settings-template.yaml @@ -31,7 +31,6 @@ seed: -1 custom_stopping_strings: '' custom_token_bans: '' negative_prompt: '' -autoload_model: false dark_theme: true default_extensions: [] instruction_template_str: |- From 3526b7923c9f5a3b3ba55056e445a660a03d2bc6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 17:40:53 -0700 Subject: [PATCH 008/164] Remove extensions with requirements from portable builds --- .github/workflows/build-portable-release-cuda.yml | 2 ++ .github/workflows/build-portable-release-vulkan.yml | 2 ++ .github/workflows/build-portable-release.yml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml index fb9e61b0..571cbac0 100644 --- a/.github/workflows/build-portable-release-cuda.yml +++ 
b/.github/workflows/build-portable-release-cuda.yml @@ -102,6 +102,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables CUDA_VERSION="${{ matrix.cuda }}" diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml index 8de29791..4e88d4d9 100644 --- a/.github/workflows/build-portable-release-vulkan.yml +++ b/.github/workflows/build-portable-release-vulkan.yml @@ -101,6 +101,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables AVX_SUPPORT="${{ matrix.avx }}" diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml index bdf96cec..6910ce2c 100644 --- a/.github/workflows/build-portable-release.yml +++ b/.github/workflows/build-portable-release.yml @@ -101,6 +101,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables AVX_SUPPORT="${{ matrix.avx }}" From d08acb4af9c2a4f4d0f7fd97babb217c0890e1c8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 20:50:52 -0700 Subject: [PATCH 009/164] UI: Rename enable_thinking -> Enable thinking --- modules/ui_parameters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 071b30b6..733d0901 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -82,7 +82,7 @@ def create_ui(default_preset): shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle mode.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle mode.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.') From b21bd8bb1e79466be945abfe417e92e52b63ec6f Mon Sep 17 00:00:00 2001 
From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 22:41:49 -0700 Subject: [PATCH 010/164] UI: Invert user/assistant message colors in instruct mode The goal is to make assistant messages more readable. --- css/html_instruct_style.css | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 4613b380..b98544a1 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -61,11 +61,11 @@ } .dark .chat .user-message { - background: transparent; + background: var(--light-gray); } .dark .chat .assistant-message { - background: var(--light-gray); + background: transparent; } .chat .user-message .text, From b71ef50e9d01c15a09c67b95e2032fed535c63ba Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 2 May 2025 23:45:58 -0700 Subject: [PATCH 011/164] UI: Add a min-height to prevent constant scrolling during chat streaming --- css/chat_style-Dark.css | 2 ++ css/chat_style-TheEncrypted777.css | 2 ++ css/chat_style-cai-chat.css | 1 + css/main.css | 5 +++++ modules/html_generator.py | 9 ++++++--- 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css index 368a2a16..3b2bd385 100644 --- a/css/chat_style-Dark.css +++ b/css/chat_style-Dark.css @@ -1,5 +1,6 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 28px; font-size: 18px; @@ -102,6 +103,7 @@ @media screen and (width <= 688px) { .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 25px; font-size: 15px; diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css index 6404f41d..25d26db8 100644 --- a/css/chat_style-TheEncrypted777.css +++ b/css/chat_style-TheEncrypted777.css @@ -2,6 +2,7 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 28px; font-size: 18px; @@ -100,6 +101,7 @@ @media screen and (width <= 688px) { .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 25px; font-size: 15px; diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index 93276bd3..223f6150 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -1,5 +1,6 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 2em; font-size: 15px; diff --git a/css/main.css b/css/main.css index d6e5ac83..cf0dfde7 100644 --- a/css/main.css +++ b/css/main.css @@ -403,6 +403,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat-parent { height: calc(100dvh - 98px - var(--input-delta)); overflow: auto !important; + /* scroll-behavior: smooth; */ border-radius: 0 !important; margin-bottom: var(--input-delta) !important; } @@ -1382,3 +1383,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { 50% { opacity: 1; } 100% { opacity: 0.6; } } + +.streaming { + min-height: 70vh; +} diff --git a/modules/html_generator.py b/modules/html_generator.py index 67d15b6e..a6f5f930 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -365,8 +365,9 @@ def generate_instruct_html(history): f'' ) + streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
' f'
{converted_visible[1]}
' @@ -414,8 +415,9 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'
' ) + streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
{img_bot}
' f'
' @@ -452,8 +454,9 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'
' ) + streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
' f'
{converted_visible[1]}
' From ea60f14674a89d3a71e5504edacb8f64f148b57c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 3 May 2025 06:06:50 -0700 Subject: [PATCH 012/164] UI: Show the list of files if the user tries to download a GGUF repository --- modules/ui_model_menu.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 9361ef91..2c593df6 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -192,6 +192,26 @@ def load_lora_wrapper(selected_loras): def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False): try: + # Handle direct GGUF URLs + if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")): + try: + path = repo_id.split("huggingface.co/")[1] + + # Extract the repository ID (first two parts of the path) + parts = path.split("/") + if len(parts) >= 2: + extracted_repo_id = f"{parts[0]}/{parts[1]}" + + # Extract the filename (last part of the path) + filename = repo_id.split("/")[-1] + if "?download=true" in filename: + filename = filename.replace("?download=true", "") + + repo_id = extracted_repo_id + specific_file = filename + except: + pass + if repo_id == "": yield ("Please enter a model path") return @@ -205,6 +225,18 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur yield ("Getting the download links from Hugging Face") links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file) + + # Check for multiple GGUF files + gguf_files = [link for link in links if link.lower().endswith('.gguf')] + if len(gguf_files) > 1 and not specific_file: + output = "Multiple GGUF files found. 
Please copy one of the following filenames to the 'File name' field:\n\n```\n" + for link in gguf_files: + output += f"{Path(link).name}\n" + + output += "```" + yield output + return + if return_links: output = "```\n" for link in links: From 4c2e3b168bc1751dbb3f1b222fdd749ad7a5d36e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 3 May 2025 06:51:20 -0700 Subject: [PATCH 013/164] llama.cpp: Add a retry mechanism when getting the logits (sometimes it fails) --- modules/llama_cpp_server.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index d9187db8..2ebeb560 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -210,14 +210,15 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - response = self.session.post(url, json=payload) - result = response.json() + for retry in range(5): + response = self.session.post(url, json=payload) + result = response.json() - if "completion_probabilities" in result: - if use_samplers: - return result["completion_probabilities"][0]["top_probs"] - else: - return result["completion_probabilities"][0]["top_logprobs"] + if "completion_probabilities" in result: + if use_samplers: + return result["completion_probabilities"][0]["top_probs"] + else: + return result["completion_probabilities"][0]["top_logprobs"] else: raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}") From 5f5569e9ac21ffdcb335b0557909ad102104fc8f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 06:20:24 -0700 Subject: [PATCH 014/164] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3280186c..8a7b2467 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again. -You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`. +You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, or `update_wizard_macos.sh`.
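As an aside on the download handling added in PATCH 012 above: the direct-URL branch boils down to splitting a pasted Hugging Face GGUF link into a repository ID and a file name. A minimal standalone sketch of that logic follows; the URL in the usage line is a made-up example, not a real model.

    def split_gguf_url(repo_id, specific_file=""):
        # Mirrors the PATCH 012 logic: accept a direct huggingface.co link to a
        # .gguf file and derive the repo ID plus the specific file to download.
        if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and \
                (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")):
            path = repo_id.split("huggingface.co/")[1]
            parts = path.split("/")
            if len(parts) >= 2:
                extracted_repo_id = f"{parts[0]}/{parts[1]}"
                filename = repo_id.split("/")[-1].replace("?download=true", "")
                repo_id = extracted_repo_id
                specific_file = filename
        return repo_id, specific_file

    # Example (hypothetical URL):
    print(split_gguf_url(
        "https://huggingface.co/example-user/example-model-GGUF/resolve/main/example-q4_k_m.gguf?download=true"
    ))
    # -> ('example-user/example-model-GGUF', 'example-q4_k_m.gguf')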
@@ -55,7 +55,7 @@ Setup details and information about installing manually The script uses Miniconda to set up a Conda environment in the `installer_files` folder. -If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, `cmd_macos.sh`, or `cmd_wsl.bat`. +If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`. * There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root. * To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts. From b7a5c7db8de89159144fadb59920045efc3fe544 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 07:14:42 -0700 Subject: [PATCH 015/164] llama.cpp: Handle short arguments in --extra-flags --- modules/llama_cpp_server.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 2ebeb560..7244001a 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -317,9 +317,15 @@ class LlamaServer: for flag_item in extra_flags.split(','): if '=' in flag_item: flag, value = flag_item.split('=', 1) - cmd += [f"--{flag}", value] + if len(flag) <= 3: + cmd += [f"-{flag}", value] + else: + cmd += [f"--{flag}", value] else: - cmd.append(f"--{flag_item}") + if len(flag_item) <= 3: + cmd.append(f"-{flag_item}") + else: + cmd.append(f"--{flag_item}") env = os.environ.copy() if os.name == 'posix': From 7853fb1c8d701bb8b720b3907bdc50017911d6a6 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Sun, 4 May 2025 18:58:37 -0300 Subject: [PATCH 016/164] Optimize the Chat tab (#6948) --- css/main.css | 34 +++++++++++----------------------- js/main.js | 8 +------- modules/ui_chat.py | 2 +- 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/css/main.css b/css/main.css index cf0dfde7..64e96ccc 100644 --- a/css/main.css +++ b/css/main.css @@ -389,7 +389,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat { margin-left: auto; margin-right: auto; - min-height: var(--chat-height); + flex: 1; overflow-y: auto; display: flex; flex-direction: column; @@ -401,11 +401,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .chat-parent { - height: calc(100dvh - 98px - var(--input-delta)); + flex: 1; overflow: auto !important; - /* scroll-behavior: smooth; */ border-radius: 0 !important; - margin-bottom: var(--input-delta) !important; } .chat-parent .prose { @@ -422,8 +420,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .chat-parent.bigchat { - height: calc(100dvh - 98px - var(--input-delta)) !important; - margin-bottom: var(--input-delta) !important; + flex: 1; } .chat > .messages { @@ -604,8 +601,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .chat-input-positioned { - position: absolute; - bottom: 0; max-width: 54rem; left: 50%; transform: translateX(-50%); @@ -790,7 +785,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input-container { - min-width: 0 !important; + display: flex; + flex-direction: column; } #chat-input-container > .form { @@ -799,9 +795,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } 
#chat-input-row { - padding-bottom: 1.5em; - padding-left: 1rem; - padding-right: 1rem; + padding: 1rem; } #chat-input-row.bigchat { @@ -809,22 +803,16 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-col { - padding-bottom: 100px; + height: 100dvh; + display: flex; + flex-direction: column; + padding-bottom: 0; } @media screen and (width <= 924px) { #chat-col { - padding-bottom: 100px; + height: calc(100dvh - 132px); margin-top: 32px; - position: relative; /* Ensure positioning for the pseudo-element */ - } - - .chat-parent { - height: calc(100dvh - 98px - var(--input-delta) - 32px); - } - - .chat-parent.bigchat { - height: calc(100dvh - 98px - var(--input-delta) - 32px) !important; } } diff --git a/js/main.js b/js/main.js index 33b7d6bd..408815db 100644 --- a/js/main.js +++ b/js/main.js @@ -442,12 +442,6 @@ function updateCssProperties() { // Check if the chat container is visible if (chatContainer.clientHeight > 0) { - const chatContainerParentHeight = chatContainer.parentNode.clientHeight; - const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`; - - document.documentElement.style.setProperty("--chat-height", newChatHeight); - document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`); - // Adjust scrollTop based on input height change if (chatInputHeight !== currentChatInputHeight) { const deltaHeight = chatInputHeight - currentChatInputHeight; @@ -720,7 +714,7 @@ function isMobile() { // Function to initialize sidebars function initializeSidebars() { const isOnMobile = isMobile(); - + if (isOnMobile) { // Mobile state: Hide sidebars and set closed states [pastChatsRow, chatControlsRow, headerBar].forEach(el => { diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 0d588549..0856cfab 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -46,8 +46,8 @@ def create_ui(): with gr.Row(): with gr.Column(elem_id='chat-col'): - shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer + shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) with gr.Row(elem_id="chat-input-row"): with gr.Column(scale=1, elem_id='gr-hover-container'): gr.HTML(value='
', elem_id='gr-hover') From d1866219261c5fa1e9d8c0a9c6c380b965ca7cc7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 15:19:46 -0700 Subject: [PATCH 017/164] UI: Fixes after previous commit --- css/main.css | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 64e96ccc..f76a2787 100644 --- a/css/main.css +++ b/css/main.css @@ -787,6 +787,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #chat-input-container { display: flex; flex-direction: column; + min-width: 0 !important; } #chat-input-container > .form { @@ -807,12 +808,13 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: flex; flex-direction: column; padding-bottom: 0; + gap: 0; } @media screen and (width <= 924px) { #chat-col { - height: calc(100dvh - 132px); margin-top: 32px; + height: calc(100dvh - 32px); } } From 84ab1f95bedd2433cec165502375901fdaa56a98 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 15:21:52 -0700 Subject: [PATCH 018/164] UI: Increase the chat area a bit --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index f76a2787..d5d5e771 100644 --- a/css/main.css +++ b/css/main.css @@ -797,6 +797,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #chat-input-row { padding: 1rem; + padding-top: 0; } #chat-input-row.bigchat { From d9da16edba88b09dcbfe97a7be302c65ed244ebb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 16:53:52 -0700 Subject: [PATCH 019/164] UI: Remove the chat input textarea border --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index d5d5e771..b3e699fa 100644 --- a/css/main.css +++ b/css/main.css @@ -581,6 +581,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #chat-input textarea { padding: 0.65rem 2.5rem; + border: 0; } #chat-input textarea::placeholder { From 690d693913f68d25d08fd74db902495766c12e5e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 18:04:52 -0700 Subject: [PATCH 020/164] UI: Add padding to only show the last message/reply after sending a message To avoid scrolling --- css/chat_style-Dark.css | 3 ++- css/chat_style-TheEncrypted777.css | 3 ++- css/chat_style-cai-chat-square.css | 3 ++- css/chat_style-cai-chat.css | 3 ++- css/chat_style-messenger.css | 3 ++- css/chat_style-wpp.css | 3 ++- css/html_instruct_style.css | 4 ---- css/main.css | 4 ---- js/main.js | 10 ++++++++++ modules/html_generator.py | 9 +++------ 10 files changed, 25 insertions(+), 20 deletions(-) diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css index 3b2bd385..1ad46bc0 100644 --- a/css/chat_style-Dark.css +++ b/css/chat_style-Dark.css @@ -2,7 +2,8 @@ display: grid; align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 28px; + padding-bottom: 14px; + padding-top: 14px; font-size: 18px; font-family: Roboto, Arial, sans-serif; /* Modern font */ line-height: 1.5; diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css index 25d26db8..9e1230b7 100644 --- a/css/chat_style-TheEncrypted777.css +++ b/css/chat_style-TheEncrypted777.css @@ -4,7 +4,8 @@ display: grid; align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 28px; + padding-bottom: 14px; + padding-top: 14px; font-size: 18px; font-family: 'Noto Sans', Arial, sans-serif; 
line-height: 1.428571429; diff --git a/css/chat_style-cai-chat-square.css b/css/chat_style-cai-chat-square.css index 854fff60..015f6927 100644 --- a/css/chat_style-cai-chat-square.css +++ b/css/chat_style-cai-chat-square.css @@ -16,6 +16,7 @@ } .message { - padding-bottom: 2em; + padding-bottom: 1em; + padding-top: 1em; grid-template-columns: 70px minmax(0, 1fr); } diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index 223f6150..0e91101f 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -2,7 +2,8 @@ display: grid; align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 2em; + padding-bottom: 1em; + padding-top: 1em; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 22.5px !important; diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css index f0fd1578..6518d6ca 100644 --- a/css/chat_style-messenger.css +++ b/css/chat_style-messenger.css @@ -1,5 +1,6 @@ .message { - padding-bottom: 25px; + padding-bottom: 12.5px; + padding-top: 12.5px; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 1.428571429; diff --git a/css/chat_style-wpp.css b/css/chat_style-wpp.css index 30ca61f3..1442dd0a 100644 --- a/css/chat_style-wpp.css +++ b/css/chat_style-wpp.css @@ -1,5 +1,6 @@ .message { - padding-bottom: 25px; + padding-bottom: 12.5px; + padding-top: 12.5px; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 1.428571429; diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index b98544a1..f4339311 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -8,10 +8,6 @@ padding-top: 0 !important; } -.chat > .messages > :last-child { - margin-bottom: 1.7rem !important; -} - .chat .message-body p, .chat .message-body li { font-size: 1rem !important; line-height: 28px !important; diff --git a/css/main.css b/css/main.css index b3e699fa..9915735d 100644 --- a/css/main.css +++ b/css/main.css @@ -1375,7 +1375,3 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { 50% { opacity: 1; } 100% { opacity: 0.6; } } - -.streaming { - min-height: 70vh; -} diff --git a/js/main.js b/js/main.js index 408815db..e6611788 100644 --- a/js/main.js +++ b/js/main.js @@ -150,6 +150,16 @@ const observer = new MutationObserver(function(mutations) { if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) { targetElement.scrollTop = targetElement.scrollHeight; } + + const chatElement = document.getElementById("chat"); + if (chatElement) { + const messagesContainer = chatElement.querySelector(".messages"); + const lastChild = messagesContainer?.lastElementChild; + const prevSibling = lastChild?.previousElementSibling; + if (lastChild && prevSibling) { + lastChild.style.minHeight = `calc(100vh - ${prevSibling.offsetHeight}px - 102px)`; + } + } }); // Configure the observer to watch for changes in the subtree and attributes diff --git a/modules/html_generator.py b/modules/html_generator.py index a6f5f930..67d15b6e 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -365,9 +365,8 @@ def generate_instruct_html(history): f'
' ) - streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
' f'
{converted_visible[1]}
' @@ -415,9 +414,8 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'
' ) - streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
{img_bot}
' f'
' @@ -454,9 +452,8 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'
' ) - streaming_class = " streaming" if i == len(history["visible"]) - 1 else "" output += ( - f'
' f'
' f'
{converted_visible[1]}
' From 2da197bba4b1547d51086e312d877d942e810be2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 18:29:05 -0700 Subject: [PATCH 021/164] Refinement after previous commit --- js/main.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index e6611788..205cf88e 100644 --- a/js/main.js +++ b/js/main.js @@ -157,7 +157,7 @@ const observer = new MutationObserver(function(mutations) { const lastChild = messagesContainer?.lastElementChild; const prevSibling = lastChild?.previousElementSibling; if (lastChild && prevSibling) { - lastChild.style.minHeight = `calc(100vh - ${prevSibling.offsetHeight}px - 102px)`; + lastChild.style.minHeight = `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px))`; } } }); From d0211afb3c513bde0d8662bd686ddb0dc87354cd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 18:52:01 -0700 Subject: [PATCH 022/164] Save the chat history right after sending a message --- modules/chat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/chat.py b/modules/chat.py index 98913d5c..feac6bdd 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -483,6 +483,8 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): history = state['history'] for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)): yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history + if i == 0: + save_history(history, state['unique_id'], state['character_menu'], state['mode']) save_history(history, state['unique_id'], state['character_menu'], state['mode']) From df7bb0db1fe6478d037debf73272b10cef1f75c7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 20:03:55 -0700 Subject: [PATCH 023/164] Rename --n-gpu-layers to --gpu-layers --- modules/shared.py | 2 +- modules/ui_model_menu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index cfedb992..b952c4a1 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -120,7 +120,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') -group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') +group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=0, metavar='N', help='Number of layers to offload to the GPU.') group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.') group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. 
This saves VRAM but reduces the performance.') diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 2c593df6..943645cf 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -49,7 +49,7 @@ def create_ui(): with gr.Blocks(): with gr.Row(): with gr.Column(): - shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) From f3da45f65d76f8c48fd95678ecc841afb0ddd04e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 4 May 2025 20:37:15 -0700 Subject: [PATCH 024/164] ExLlamaV3_HF: Change max_chunk_size to 256 --- modules/exllamav3_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index 12b22f64..417df473 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -119,7 +119,7 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): reset = True # Maximum number of tokens to process in a single forward pass - max_chunk_size = 2048 + max_chunk_size = 256 # Make the forward call if labels is None: From b817bb33fd7b26a24c81798dabb36af4620d4a53 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 04:54:25 -0700 Subject: [PATCH 025/164] Minor fix after df7bb0db1fe6478d037debf73272b10cef1f75c7 --- modules/llama_cpp_server.py | 2 +- modules/ui_model_menu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 7244001a..0ddb3fff 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -256,7 +256,7 @@ class LlamaServer: self.server_path, "--model", self.model_path, "--ctx-size", str(shared.args.ctx_size), - "--n-gpu-layers", str(shared.args.n_gpu_layers), + "--gpu-layers", str(shared.args.gpu_layers), "--batch-size", str(shared.args.batch_size), "--port", str(self.port), ] diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 943645cf..e05d2256 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -49,7 +49,7 @@ def create_ui(): with gr.Blocks(): with gr.Row(): with gr.Column(): - shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. 
⚠️ Lower this value if you can\'t load the model.') shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) From 475e012ee8e0cbeb53fade01359cd649b9b5d470 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 06:16:11 -0700 Subject: [PATCH 026/164] UI: Improve the light theme colors --- css/main.css | 3 ++- modules/ui.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/css/main.css b/css/main.css index 9915735d..b1d6e345 100644 --- a/css/main.css +++ b/css/main.css @@ -2,7 +2,7 @@ --darker-gray: #202123; --dark-gray: #343541; --light-gray: #444654; - --light-theme-gray: #f5f5f5; + --light-theme-gray: #f3f4f6; --border-color-dark: #525252; --header-width: 112px; --selected-item-color-dark: #32333e; @@ -580,6 +580,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input textarea { + background: var(--light-theme-gray); padding: 0.65rem 2.5rem; border: 0; } diff --git a/modules/ui.py b/modules/ui.py index fb016f87..d08c1435 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -61,7 +61,7 @@ if not shared.args.old_colors: background_fill_primary_dark='var(--darker-gray)', body_background_fill="white", block_background_fill="transparent", - body_text_color="#333", + body_text_color='rgb(64, 64, 64)', button_secondary_background_fill="#f4f4f4", button_secondary_border_color="var(--border-color-primary)", From 6001d279c64d92c1f2a312142e41119807694729 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 07:42:13 -0700 Subject: [PATCH 027/164] Light theme improvement --- css/main.css | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index b1d6e345..38585a1c 100644 --- a/css/main.css +++ b/css/main.css @@ -979,6 +979,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { cursor: pointer; } +#past-chats .selected, +#past-chats label:hover { + background-color: rgb(224, 224, 224) !important; +} + #past-chats-buttons, #delete-chat-row, #rename-row { @@ -987,7 +992,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { gap: 9px; } - #past-chats-row, #chat-controls { width: 260px; From 967b70327ea10a9c5cc7c932583993687b9d4ba7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 07:56:47 -0700 Subject: [PATCH 028/164] Light theme improvement --- css/html_instruct_style.css | 2 +- css/main.css | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index f4339311..fb984338 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -42,7 +42,7 @@ } .chat .user-message { - background: #f5f5f5; + background: #f3f4f6; padding: 1.5rem 1rem; padding-bottom: 2rem; border-radius: 0; diff --git a/css/main.css b/css/main.css index 38585a1c..d6a0d220 100644 --- a/css/main.css +++ b/css/main.css @@ -2,7 +2,7 @@ --darker-gray: #202123; --dark-gray: #343541; --light-gray: #444654; - --light-theme-gray: #f3f4f6; + --light-theme-gray: #f9fbff; --border-color-dark: #525252; --header-width: 112px; --selected-item-color-dark: #32333e; @@ -580,7 +580,7 @@ div.svelte-362y77>*, 
div.svelte-362y77>.form>* { } #chat-input textarea { - background: var(--light-theme-gray); + background: #f3f4f6; padding: 0.65rem 2.5rem; border: 0; } @@ -981,7 +981,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { #past-chats .selected, #past-chats label:hover { - background-color: rgb(224, 224, 224) !important; + background-color: #dbeafe !important; } #past-chats-buttons, @@ -1123,8 +1123,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border: 0 !important; } -.dark #past-chats .selected, -.dark #past-chats label:hover { +.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats .selected, +.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats label:hover { background-color: var(--selected-item-color-dark) !important; } @@ -1161,7 +1161,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .header_bar button.selected { - background: #E0E0E0; + background: #dbeafe; } #chat-controls, From bf5290bc0ff15f6894a4eb5785e8df60831ecb25 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 08:04:12 -0700 Subject: [PATCH 029/164] Fix the hover menu in light theme --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index d6a0d220..59165a62 100644 --- a/css/main.css +++ b/css/main.css @@ -742,7 +742,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .hover-menu button { width: 100%; - background: transparent !important; + background: white !important; border-radius: 0 !important; justify-content: space-between; margin: 0 !important; From 53d8e4650202f5891364197011098b3af34fe6ac Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 12:28:17 -0700 Subject: [PATCH 030/164] Ensure environment isolation in portable installs --- start_linux.sh | 9 +++++---- start_macos.sh | 9 +++++---- start_windows.bat | 9 +++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/start_linux.sh b/start_linux.sh index c74f1272..e2b00558 100755 --- a/start_linux.sh +++ b/start_linux.sh @@ -1,5 +1,10 @@ #!/usr/bin/env bash +# environment isolation +export PYTHONNOUSERSITE=1 +unset PYTHONPATH +unset PYTHONHOME + cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case @@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then exit fi -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME export CUDA_PATH="$INSTALL_ENV_DIR" export CUDA_HOME="$CUDA_PATH" diff --git a/start_macos.sh b/start_macos.sh index 7a060ba6..bff11bc1 100755 --- a/start_macos.sh +++ b/start_macos.sh @@ -1,5 +1,10 @@ #!/bin/bash +# environment isolation +export PYTHONNOUSERSITE=1 +unset PYTHONPATH +unset PYTHONHOME + cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case @@ -61,10 +66,6 @@ if [ ! 
-e "$INSTALL_ENV_DIR/bin/python" ]; then exit fi -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME export CUDA_PATH="$INSTALL_ENV_DIR" export CUDA_HOME="$CUDA_PATH" diff --git a/start_windows.bat b/start_windows.bat index 1616ee27..f5e66ec2 100755 --- a/start_windows.bat +++ b/start_windows.bat @@ -1,6 +1,11 @@ @echo off setlocal enabledelayedexpansion +@rem environment isolation +set PYTHONNOUSERSITE=1 +set PYTHONPATH= +set PYTHONHOME= + cd /D "%~dp0" @rem Portable install case @@ -87,10 +92,6 @@ if not exist "%INSTALL_ENV_DIR%" ( @rem check if conda environment was actually created if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end ) -@rem environment isolation -set PYTHONNOUSERSITE=1 -set PYTHONPATH= -set PYTHONHOME= set "CUDA_PATH=%INSTALL_ENV_DIR%" set "CUDA_HOME=%CUDA_PATH%" From 8137eb8ef46ac6950cb96094e3cc30b0a72dee76 Mon Sep 17 00:00:00 2001 From: mamei16 Date: Mon, 5 May 2025 23:05:23 +0200 Subject: [PATCH 031/164] Dynamic Chat Message UI Update Speed (#6952) --- modules/shared.py | 1 - modules/text_generation.py | 18 ++++++++---------- modules/ui.py | 1 - modules/ui_parameters.py | 2 -- user_data/settings-template.yaml | 1 - 5 files changed, 8 insertions(+), 15 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index b952c4a1..b4dfbfd1 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -47,7 +47,6 @@ settings = { 'max_new_tokens_max': 4096, 'prompt_lookup_num_tokens': 0, 'max_tokens_second': 0, - 'max_updates_second': 12, 'auto_max_new_tokens': True, 'ban_eos_token': False, 'add_bos_token': True, diff --git a/modules/text_generation.py b/modules/text_generation.py index 8fd65dc4..7e48a2f6 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -64,41 +64,39 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap all_stop_strings += st shared.stop_everything = False - last_update = -1 reply = '' is_stream = state['stream'] if len(all_stop_strings) > 0 and not state['stream']: state = copy.deepcopy(state) state['stream'] = True - min_update_interval = 0 - if state.get('max_updates_second', 0) > 0: - min_update_interval = 1 / state['max_updates_second'] - # Generate + last_update = -1 + latency_threshold = 1 / 1000 for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat): + cur_time = time.monotonic() reply, stop_found = apply_stopping_strings(reply, all_stop_strings) if escape_html: reply = html.escape(reply) if is_stream: - cur_time = time.time() - # Limit number of tokens/second to make text readable in real time if state['max_tokens_second'] > 0: diff = 1 / state['max_tokens_second'] - (cur_time - last_update) if diff > 0: time.sleep(diff) - last_update = time.time() + last_update = time.monotonic() yield reply # Limit updates to avoid lag in the Gradio UI # API updates are not limited else: - if cur_time - last_update > min_update_interval: - last_update = cur_time + # If 'generate_func' takes less than 0.001 seconds to yield the next token + # (equivalent to more than 1000 tok/s), assume that the UI is lagging behind and skip yielding + if (cur_time - last_update) > latency_threshold: yield reply + last_update = time.monotonic() if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything): break diff --git a/modules/ui.py b/modules/ui.py index d08c1435..b3d4bccf 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -192,7 +192,6 @@ def list_interface_input_elements(): 
'max_new_tokens', 'prompt_lookup_num_tokens', 'max_tokens_second', - 'max_updates_second', 'do_sample', 'dynamic_temperature', 'temperature_last', diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 733d0901..84f9fbfc 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -71,8 +71,6 @@ def create_ui(default_preset): shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.') shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.') shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.') - shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.') - with gr.Column(): with gr.Row(): with gr.Column(): diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml index ce0f77e1..db481e84 100644 --- a/user_data/settings-template.yaml +++ b/user_data/settings-template.yaml @@ -18,7 +18,6 @@ max_new_tokens_min: 1 max_new_tokens_max: 4096 prompt_lookup_num_tokens: 0 max_tokens_second: 0 -max_updates_second: 12 auto_max_new_tokens: true ban_eos_token: false add_bos_token: true From 85bf2e15b98117ef5630e81bf4a002440fffe2c2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 14:14:48 -0700 Subject: [PATCH 032/164] API: Remove obsolete multimodal extension handling Multimodal support will be added back once it's implemented in llama-server. 
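For reference, the block removed below handled OpenAI-style multimodal requests, where a message's "content" is a list of typed items instead of a plain string. A hedged sketch of that request shape (the field values are illustrative placeholders, not taken from the repository):

    # One user turn in the OpenAI chat format mixing text and an image; the removed
    # handler unpacked such lists into separate {"image_url": ...} and {"content": ...}
    # history entries before prompt building.
    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is shown in this picture?"},
            {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}},
        ],
    }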
--- extensions/openai/completions.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 75e2cc11..46c76199 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -96,30 +96,6 @@ def convert_history(history): user_input_last = True system_message = "" - # Multimodal: convert OpenAI format to multimodal extension format - if any('content' in entry and isinstance(entry['content'], list) for entry in history): - new_history = [] - for entry in history: - if isinstance(entry['content'], list): - for item in entry['content']: - if not isinstance(item, dict): - continue - - image_url = None - content = None - if item['type'] == 'image_url' and isinstance(item['image_url'], dict): - image_url = item['image_url']['url'] - elif item['type'] == 'text' and isinstance(item['text'], str): - content = item['text'] - if image_url: - new_history.append({"image_url": image_url, "role": "user"}) - if content: - new_history.append({"content": content, "role": "user"}) - else: - new_history.append(entry) - - history = new_history - for entry in history: if "image_url" in entry: image_url = entry['image_url'] From f82667f0b4c0824420a6637efee3c680ddbe25f3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 14:17:00 -0700 Subject: [PATCH 033/164] Remove more multimodal extension references --- extensions/openai/completions.py | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 46c76199..a7d8b4e4 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -1,13 +1,8 @@ -import base64 import copy -import re import time from collections import deque -from io import BytesIO -import requests import tiktoken -from PIL import Image from extensions.openai.errors import InvalidRequestError from extensions.openai.utils import debug_msg @@ -97,28 +92,7 @@ def convert_history(history): system_message = "" for entry in history: - if "image_url" in entry: - image_url = entry['image_url'] - if "base64" in image_url: - image_url = re.sub('^data:image/.+;base64,', '', image_url) - img = Image.open(BytesIO(base64.b64decode(image_url))) - else: - try: - my_res = requests.get(image_url) - img = Image.open(BytesIO(my_res.content)) - except Exception: - raise 'Image cannot be loaded from the URL!' - - buffered = BytesIO() - if img.mode in ("RGBA", "P"): - img = img.convert("RGB") - - img.save(buffered, format="JPEG") - img_str = base64.b64encode(buffered.getvalue()).decode('utf-8') - content = f'' - else: - content = entry["content"] - + content = entry["content"] role = entry["role"] if role == "user": From 941e0663da48345150ae77d7c6b6eb54e21d671d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 14:18:05 -0700 Subject: [PATCH 034/164] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8a7b2467..6cc84c50 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ If you ever need to install something manually in the `installer_files` environm * There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root. * To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. 
At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts. * For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki). -* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. +* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. ### Manual installation using Conda From 987505ead345b0e113d636311f6a5faa4fcbe986 Mon Sep 17 00:00:00 2001 From: Evgenii Novikov Date: Tue, 6 May 2025 00:03:33 +0200 Subject: [PATCH 035/164] docker: Fix app uid typo in cpu docker compose (#6957) --- docker/cpu/docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/cpu/docker-compose.yml b/docker/cpu/docker-compose.yml index c9d415ae..9aba314a 100644 --- a/docker/cpu/docker-compose.yml +++ b/docker/cpu/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: From 99bd66445f90df58a3f3832b35cca94dc397d1be Mon Sep 17 00:00:00 2001 From: Alireza Ghasemi Date: Tue, 6 May 2025 00:04:06 +0200 Subject: [PATCH 036/164] SuperboogaV2: minor update to avoid json serialization errors #6945 --- extensions/superboogav2/chromadb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extensions/superboogav2/chromadb.py b/extensions/superboogav2/chromadb.py index 6e93dd92..f4f77821 100644 --- a/extensions/superboogav2/chromadb.py +++ b/extensions/superboogav2/chromadb.py @@ -292,6 +292,8 @@ class ChromaCollector(): for doc in documents: doc_tokens = encode(doc)[0] + if isinstance(doc_tokens, np.ndarray): + doc_tokens = doc_tokens.tolist() doc_token_count = len(doc_tokens) if current_token_count + doc_token_count > max_token_count: # If adding this document would exceed the max token count, From 76f947e3cf1c71e4105f708f02b2ca163a69987c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 15:58:29 -0700 Subject: [PATCH 037/164] UI: Minor style change --- css/main.css | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/css/main.css b/css/main.css index 59165a62..520ff972 100644 --- a/css/main.css +++ b/css/main.css @@ -1380,3 +1380,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { 50% { opacity: 1; } 100% { opacity: 0.6; } } + +strong { + font-weight: bold; +} From 530223bf0b196257e41ec948c2e92e1c3e507e9f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 16:00:49 -0700 Subject: [PATCH 038/164] UI: Fix the hover menu colors --- css/main.css | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/css/main.css b/css/main.css index 520ff972..b8ba8256 100644 --- a/css/main.css +++ b/css/main.css @@ -761,6 +761,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { background: var(--button-secondary-background-fill-hover) !important; } +.dark .hover-menu button:hover { + background: 
var(--selected-item-color-dark) !important; +} + .transparent-substring { opacity: 0.333; } @@ -1109,12 +1113,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { color: #9ca3af; } -.dark .hover-menu { - background-color: var(--darker-gray); -} - .dark .hover-menu button { border-color: var(--border-color-primary); + background-color: var(--darker-gray) !important; } .dark #chat-controls, From 4e8f628d3c206e8362cea5b5f7557abe33351bc0 Mon Sep 17 00:00:00 2001 From: Evgenii Novikov Date: Tue, 6 May 2025 01:05:15 +0200 Subject: [PATCH 039/164] docker: App uid typo in other docker composes (#6958) --- docker/amd/docker-compose.yml | 2 +- docker/intel/docker-compose.yml | 2 +- docker/nvidia/docker-compose.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/amd/docker-compose.yml b/docker/amd/docker-compose.yml index 4709ae94..8866e9ed 100644 --- a/docker/amd/docker-compose.yml +++ b/docker/amd/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: diff --git a/docker/intel/docker-compose.yml b/docker/intel/docker-compose.yml index 31e9dde0..78e06698 100644 --- a/docker/intel/docker-compose.yml +++ b/docker/intel/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: diff --git a/docker/nvidia/docker-compose.yml b/docker/nvidia/docker-compose.yml index 835dd838..0392078e 100644 --- a/docker/nvidia/docker-compose.yml +++ b/docker/nvidia/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: From cbef35054cb598b033e17b0442e8dad2da6873c4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 5 May 2025 17:46:09 -0700 Subject: [PATCH 040/164] UI: CSS fix --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index b8ba8256..746f1f9e 100644 --- a/css/main.css +++ b/css/main.css @@ -426,6 +426,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat > .messages { display: flex; flex-direction: column; + min-height: calc(100vh - 102px); } .chat > .messages > :first-child { From d1c0154d664e51d1ee6ea82d9c0e799d96367d4a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 06:38:39 -0700 Subject: [PATCH 041/164] llama.cpp: Add top_n_sigma, fix typical_p in sampler priority --- modules/llama_cpp_server.py | 5 ++++- modules/presets.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 0ddb3fff..b9902cd7 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -66,6 +66,7 @@ class LlamaServer: "top_k": state["top_k"], "top_p": state["top_p"], "min_p": state["min_p"], + "top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1, "tfs_z": state["tfs"], "typical_p": state["typical_p"], "repeat_penalty": 
state["repetition_penalty"], @@ -102,8 +103,10 @@ class LlamaServer: penalty_found = False for s in samplers: - if s.strip() in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]: + if s.strip() in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc"]: filtered_samplers.append(s.strip()) + elif s.strip() == "typical_p": + filtered_samplers.append("typ_p") elif not penalty_found and s.strip() == "repetition_penalty": filtered_samplers.append("penalties") penalty_found = True diff --git a/modules/presets.py b/modules/presets.py index 50d0f985..5a9a5873 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -52,7 +52,7 @@ def default_preset(): if shared.args.portable: samplers = result['sampler_priority'].split('\n') - samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature", "repetition_penalty"]] + samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]] result['sampler_priority'] = '\n'.join(samplers) return result From 89590adc14c941814c2d54795cfc78fab959d9e7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 06:41:17 -0700 Subject: [PATCH 042/164] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- .../full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- .../full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 18 ------------------ .../portable/requirements_amd_noavx2.txt | 18 ------------------ .../portable/requirements_apple_intel.txt | 4 ++-- .../portable/requirements_apple_silicon.txt | 6 +++--- .../portable/requirements_cpu_only.txt | 4 ++-- .../portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- .../portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 34 insertions(+), 70 deletions(-) delete mode 100644 requirements/portable/requirements_amd.txt delete mode 100644 requirements/portable/requirements_amd_noavx2.txt diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index c0ace41b..a60ea7b4 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and 
python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 24eeee6a..431cd740 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 99716f3c..0c581f86 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index cc747edb..f7213efe 100644 --- 
a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 67b3260e..4aac3dea 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 47ad5759..ac277d61 100644 
--- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 334f11df..cc412d33 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 3575d352..78265f1a 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
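The "; platform_system == ..." suffixes on the wheel URLs above and below are standard environment markers: pip evaluates each marker against the installing interpreter and skips any requirement whose marker is false, which is how a single requirements file can carry Windows, Linux and Python-version specific wheels side by side. A small, hedged illustration using the packaging library (this script is not part of the patch series; the marker string is copied from the lines above):

    from packaging.markers import Marker

    marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')
    # True only on 64-bit Linux under Python 3.11, so pip installs the wheel there and skips it elsewhere.
    print(marker.evaluate())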
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index c720daa7..1240d335 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt deleted file mode 100644 index 7d9c00c0..00000000 --- a/requirements/portable/requirements_amd.txt +++ /dev/null @@ -1,18 +0,0 @@ -fastapi==0.112.4 -gradio==4.37.* -jinja2==3.1.6 -markdown -numpy==1.26.* -pydantic==2.8.2 -pyyaml -requests -rich -tqdm - -# API -flask_cloudflared==0.0.14 -sse-starlette==1.6.5 -tiktoken - -# AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt deleted file mode 100644 index d718c1b1..00000000 --- a/requirements/portable/requirements_amd_noavx2.txt +++ /dev/null @@ -1,18 +0,0 @@ -fastapi==0.112.4 -gradio==4.37.* -jinja2==3.1.6 -markdown -numpy==1.26.* -pydantic==2.8.2 -pyyaml -requests -rich -tqdm - -# API -flask_cloudflared==0.0.14 -sse-starlette==1.6.5 -tiktoken - -# AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 9e184b53..6b165b7c 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index ec059716..1b2b5cf2 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index d473b824..2793d743 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index d3fffb43..6d7316a6 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index cdfa6a01..e56eba08 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 1a7ce6ed..a7f8c703 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 4737321d..5b427fd2 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-win_amd64.whl; 
platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 605cc9ab14533dd20cc11363f020fb9947cfb723 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 06:42:15 -0700 Subject: [PATCH 043/164] Update exllamav3 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index a60ea7b4..3b50c674 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index f7213efe..ba23ea9c 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -31,5 +31,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 4aac3dea..c245ab74 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,5 +32,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 78265f1a..d8bbf6d1 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" From 1927afe89457dce8eb805b2275ba7c8a9680a967 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 08:18:49 -0700 Subject: [PATCH 044/164] Fix top_n_sigma not showing for llama.cpp --- modules/loaders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/loaders.py b/modules/loaders.py index 738198b1..217d569c 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -300,6 +300,7 @@ loaders_samplers = { 'xtc_threshold', 'xtc_probability', 'tfs', + 'top_n_sigma', 'dry_multiplier', 'dry_allowed_length', 'dry_base', From 05115e42ee1ab7a2848b883e469885ce9504f04a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 08:27:21 -0700 Subject: [PATCH 045/164] Set top_n_sigma before temperature by default --- modules/presets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/presets.py b/modules/presets.py index 5a9a5873..cf706605 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -46,7 +46,7 @@ def default_preset(): 'do_sample': True, 'dynamic_temperature': False, 'temperature_last': False, - 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram', + 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntop_n_sigma\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram', 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', } From c4f36db0d859e1819550e576a7fbd513c990c64d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 08:41:13 -0700 Subject: [PATCH 046/164] llama.cpp: remove tfs (it doesn't get used) --- modules/llama_cpp_server.py | 1 - modules/loaders.py | 1 - 2 files changed, 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index b9902cd7..d8d2f61b 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -67,7 +67,6 @@ class LlamaServer: "top_p": state["top_p"], "min_p": state["min_p"], "top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1, - "tfs_z": state["tfs"], "typical_p": state["typical_p"], "repeat_penalty": state["repetition_penalty"], "repeat_last_n": state["repetition_penalty_range"], diff --git a/modules/loaders.py b/modules/loaders.py index 217d569c..b29679bd 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -299,7 +299,6 @@ loaders_samplers = { 'typical_p', 'xtc_threshold', 'xtc_probability', - 'tfs', 'top_n_sigma', 'dry_multiplier', 'dry_allowed_length', From 5ef564a22e8df21a7480d5c8d6e32919f35f14c7 Mon Sep 17 00:00:00 2001 From: Downtown-Case Date: Tue, 6 May 2025 15:03:33 -0500 Subject: [PATCH 047/164] Fix model config loading in shared.py for Python 3.13 (#6961) --- modules/shared.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index b4dfbfd1..6fd4604c 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -310,11 +310,13 @@ if args.api or args.public_api: 
add_extension('openai', last=True) # Load model-specific settings -with Path(f'{args.model_dir}/config.yaml') as p: - if p.exists(): - model_config = yaml.safe_load(open(p, 'r').read()) - else: - model_config = {} +p = Path(f'{args.model_dir}/config.yaml') +if p.exists(): + model_config = yaml.safe_load(open(p, 'r').read()) +else: + model_config = {} +del p + # Load custom model-specific settings user_config = load_user_config() From e4fb2475d25e1dccfa39f5d943bcde61ef517245 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 14:02:01 -0700 Subject: [PATCH 048/164] UI: Multiple small style improvements (light/dark themes) --- css/html_instruct_style.css | 2 +- css/main.css | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index fb984338..6ad250aa 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -66,7 +66,7 @@ .chat .user-message .text, .chat .assistant-message .text { - max-width: 645px; + max-width: 700px; margin-left: auto; margin-right: auto; } diff --git a/css/main.css b/css/main.css index 746f1f9e..30089aca 100644 --- a/css/main.css +++ b/css/main.css @@ -545,7 +545,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border-radius: 5px; font-size: 82%; padding: 1px 3px; - background: white !important; + background: #f3f4f6 !important; color: #1f2328; } @@ -559,18 +559,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 15px; } -.message-body :not(pre) > code::before { - content: "`"; -} - -.message-body :not(pre) > code::after { - content: "`"; -} - .message-body :not(pre) > code { white-space: normal !important; font-weight: bold; - font-family: unset; + font-size: 0.95em; + font-family: Menlo,"Roboto Mono","Courier New",Courier,monospace,Inter,sans-serif; + padding: .15rem .3rem; + background-color: #ececec; +} + +.dark .message-body :not(pre) > code { + background-color: rgb(255 255 255 / 12.5%); } #chat-input { @@ -584,6 +583,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { background: #f3f4f6; padding: 0.65rem 2.5rem; border: 0; + box-shadow: 0; } #chat-input textarea::placeholder { @@ -759,7 +759,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .hover-menu button:hover { - background: var(--button-secondary-background-fill-hover) !important; + background: #dbeafe !important; } .dark .hover-menu button:hover { From b28fa86db6921adc8a42038f7062b72a27cb68b1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 6 May 2025 17:51:55 -0700 Subject: [PATCH 049/164] Default --gpu-layers to 256 --- modules/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index 6fd4604c..f2698bd2 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -119,7 +119,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') -group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=0, metavar='N', help='Number of layers to offload to the GPU.') +group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, 
metavar='N', help='Number of layers to offload to the GPU.') group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.') group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') From d2bae7694c0798f9f51bc61a1f7b20d93059f106 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 07:26:23 -0700 Subject: [PATCH 050/164] UI: Change the ctx-size description --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index e05d2256..8dea457e 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -54,7 +54,7 @@ def create_ui(): shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.') + shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536, 131072.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') From 348d4860c278eda1dedff15c05082e2d3358c3f3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 07:58:59 -0700 Subject: [PATCH 051/164] UI: Create a "Main options" section in the Model tab --- modules/ui_model_menu.py | 70 ++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 8dea457e..28b7222d 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -47,52 +47,27 @@ def create_ui(): with gr.Column(): shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None) with gr.Blocks(): + gr.Markdown("## Main options") with gr.Row(): with gr.Column(): shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. 
⚠️ Lower this value if you can\'t load the model.') - shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) - shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) - shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) - shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) + shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536, 131072.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') - shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') - shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags) - shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) - shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') - shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') - shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). 
Equal to 1/rope_freq_scale.') - shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.') - shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.') - shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') + shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) with gr.Column(): + shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') + shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') - shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') - shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') - shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') - shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) - shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') - shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. 
This saves VRAM but reduces the performance.') - shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) - shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) - shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') - shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') - shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') - shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) - shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) - shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) - shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) - shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') @@ -102,11 +77,44 @@ def create_ui(): shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. 
Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu) ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) - shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.') shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.') + shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.') shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.') + gr.Markdown("## Other options") + with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'): + with gr.Row(): + with gr.Column(): + shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) + shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) + shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) + shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') + shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags) + shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) + shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') + shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') + shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). 
Equal to 1/rope_freq_scale.') + shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.') + shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.') + shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') + + with gr.Column(): + shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') + shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) + shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') + shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') + shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) + shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) + shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') + shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') + shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) + shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) + shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) + shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) + shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') + shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') + with gr.Column(): with gr.Tab("Download"): shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. 
To download a single file, enter its name in the second box.", interactive=not mu) From a2ab42d39099d89543a8e5c5753350e51905fa36 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 08:00:38 -0700 Subject: [PATCH 052/164] UI: Remove the exllamav2 info message --- modules/loaders.py | 1 - modules/ui_model_menu.py | 1 - 2 files changed, 2 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index b29679bd..4b76549b 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -84,7 +84,6 @@ loaders_and_params = OrderedDict({ 'no_flash_attn', 'no_xformers', 'no_sdpa', - 'exllamav2_info', 'model_draft', 'draft_max', 'ctx_size_draft', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 28b7222d..33e152a0 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -68,7 +68,6 @@ def create_ui(): shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) - shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). 
\n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') # Speculative decoding From 13a434f3518e381d04acd869ed3c0ba3d3823d34 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 08:06:07 -0700 Subject: [PATCH 053/164] Bump exllamav3 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 3b50c674..ac89f45b 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index ba23ea9c..6abdb1a4 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -31,5 +31,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7-py3-none-any.whl 
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index c245ab74..682c6a47 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,5 +32,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index d8bbf6d1..1e185079 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a7/exllamav3-0.0.1a7+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; 
platform_system == "Linux" and platform_machine != "x86_64" From ed6e16191da79523c5cabfd927130b307b3b54b9 Mon Sep 17 00:00:00 2001 From: Scott Z Date: Thu, 8 May 2025 11:21:52 -0400 Subject: [PATCH 054/164] Docker fix for NVIDIA (#6964) --- docker/nvidia/Dockerfile | 2 +- docker/nvidia/docker-compose.yml | 12 +----------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/docker/nvidia/Dockerfile b/docker/nvidia/Dockerfile index 900a4329..82594a26 100644 --- a/docker/nvidia/Dockerfile +++ b/docker/nvidia/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose -COPY CMD_FLAGS.txt /home/app/text-generation-webui/ +COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui # set umask to ensure group read / write at runtime diff --git a/docker/nvidia/docker-compose.yml b/docker/nvidia/docker-compose.yml index 0392078e..23d5cacc 100644 --- a/docker/nvidia/docker-compose.yml +++ b/docker/nvidia/docker-compose.yml @@ -31,17 +31,7 @@ services: stdin_open: true tty: true volumes: - - ./cache:/home/app/text-generation-webui/cache - - ./characters:/home/app/text-generation-webui/characters - - ./extensions:/home/app/text-generation-webui/extensions - - ./loras:/home/app/text-generation-webui/loras - - ./logs:/home/app/text-generation-webui/logs - - ./models:/home/app/text-generation-webui/models - - ./presets:/home/app/text-generation-webui/presets - - ./prompts:/home/app/text-generation-webui/prompts - - ./softprompts:/home/app/text-generation-webui/softprompts - - ./training:/home/app/text-generation-webui/training - - ./cloudflared:/etc/cloudflared + - ./user_data:/home/app/text-generation-webui/user_data deploy: resources: reservations: From fa960496d554ece24c06088607692fa7b874ff5b Mon Sep 17 00:00:00 2001 From: Jonas Date: Thu, 8 May 2025 17:30:27 +0200 Subject: [PATCH 055/164] Tools support for OpenAI compatible API (#6827) --- extensions/openai/completions.py | 73 +++++++++++++++++++++---- extensions/openai/typing.py | 47 +++++++++++++++- extensions/openai/utils.py | 94 ++++++++++++++++++++++++++++++++ modules/chat.py | 12 ++-- 4 files changed, 209 insertions(+), 17 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index a7d8b4e4..ed0bcc40 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -1,11 +1,14 @@ import copy import time +import json from collections import deque import tiktoken from extensions.openai.errors import InvalidRequestError -from extensions.openai.utils import debug_msg +from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall +from extensions.openai.typing import ToolDefinition +from pydantic import ValidationError from modules import shared from modules.chat import ( generate_chat_prompt, @@ -99,19 +102,24 @@ def convert_history(history): user_input = content user_input_last = True if current_message: - chat_dialogue.append([current_message, '']) + chat_dialogue.append([current_message, '', '']) current_message = "" current_message = content elif role == "assistant": + if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "": + continue # skip tool calls 
current_reply = content user_input_last = False if current_message: - chat_dialogue.append([current_message, current_reply]) + chat_dialogue.append([current_message, current_reply, '']) current_message = "" current_reply = "" else: - chat_dialogue.append(['', current_reply]) + chat_dialogue.append(['', current_reply, '']) + elif role == "tool": + user_input_last = False + chat_dialogue.append(['', '', content]) elif role == "system": system_message += f"\n{content}" if system_message else content @@ -131,6 +139,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p if 'messages' not in body: raise InvalidRequestError(message="messages is required", param='messages') + tools = None + if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0: + tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails + messages = body['messages'] for m in messages: if 'role' not in m: @@ -188,6 +200,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p 'custom_system_message': custom_system_message, 'chat_template_str': chat_template_str, 'chat-instruct_command': chat_instruct_command, + 'tools': tools, 'history': history, 'stream': stream }) @@ -200,7 +213,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p requested_model = generate_params.pop('model') logprob_proc = generate_params.pop('logprob_proc', None) - def chat_streaming_chunk(content): + def chat_streaming_chunk(content, chunk_tool_calls=None): # begin streaming chunk = { "id": cmpl_id, @@ -210,7 +223,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p resp_list: [{ "index": 0, "finish_reason": None, - "delta": {'role': 'assistant', 'content': content}, + "delta": {'role': 'assistant', 'content': content, 'tool_calls': chunk_tool_calls}, }], } @@ -219,6 +232,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]} # else: # chunk[resp_list][0]["logprobs"] = None + return chunk # generate reply ####################################### @@ -227,8 +241,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p yield {'prompt': prompt} return - debug_msg({'prompt': prompt, 'generate_params': generate_params}) - if stream: yield chat_streaming_chunk('') @@ -238,8 +250,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p answer = '' seen_content = '' + tool_calls = [] + end_last_tool_call = 0 + supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None + for a in generator: answer = a['internal'][-1][1] + + if supported_tools is not None: + tool_call = parseToolCall(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else [] + if len(tool_call) > 0: + for tc in tool_call: + tc["id"] = getToolCallId() + tc["index"] = str(len(tool_calls)) + tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"]) + tool_calls.append(tc) + end_last_tool_call = len(answer) + if stream: len_seen = len(seen_content) new_content = answer[len_seen:] @@ -247,18 +274,25 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet. 
continue - seen_content = answer chunk = chat_streaming_chunk(new_content) + + seen_content = answer yield chunk + # stop generation if tool_calls were generated previously + if len(tool_calls) > 0: + break + token_count = len(encode(prompt)[0]) completion_token_count = len(encode(answer)[0]) stop_reason = "stop" + if len(tool_calls) > 0: + stop_reason = "tool_calls" if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']: stop_reason = "length" if stream: - chunk = chat_streaming_chunk('') + chunk = chat_streaming_chunk('', tool_calls) chunk[resp_list][0]['finish_reason'] = stop_reason chunk['usage'] = { "prompt_tokens": token_count, @@ -276,7 +310,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p resp_list: [{ "index": 0, "finish_reason": stop_reason, - "message": {"role": "assistant", "content": answer} + "message": {"role": "assistant", "content": answer}, + "tool_calls": tool_calls }], "usage": { "prompt_tokens": token_count, @@ -465,3 +500,19 @@ def completions(body: dict, is_legacy: bool = False) -> dict: def stream_completions(body: dict, is_legacy: bool = False): for resp in completions_common(body, is_legacy, stream=True): yield resp + + +def validateTools(tools: list[dict]): + # Validate each tool definition in the JSON array + valid_tools = None + for idx in range(len(tools)): + tool = tools[idx] + try: + tool_definition = ToolDefinition(**tool) + if valid_tools is None: + valid_tools = [] + valid_tools.append(tool) + except ValidationError: + raise InvalidRequestError(message=f"Invalid tool specification at index {idx}.", param='tools') + + return valid_tools diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index b1979cbc..b28ebb4e 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -1,8 +1,8 @@ import json import time -from typing import Dict, List +from typing import Dict, List, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, validator class GenerationOptions(BaseModel): @@ -54,6 +54,48 @@ class GenerationOptions(BaseModel): grammar_string: str = "" +class ToolDefinition(BaseModel): + function: 'ToolFunction' + type: str + + +class ToolFunction(BaseModel): + description: str + name: str + parameters: 'ToolParameters' + + +class ToolParameters(BaseModel): + properties: Optional[Dict[str, 'ToolProperty']] = None + required: Optional[list[str]] = None + type: str + description: Optional[str] = None + + +class ToolProperty(BaseModel): + description: Optional[str] = None + type: Optional[str] = None # we are faced with definitions like anyOf, e.g. 
{'type': 'function', 'function': {'name': 'git_create_branch', 'description': 'Creates a new branch from an optional base branch', 'parameters': {'type': 'object', 'properties': {'repo_path': {'title': 'Repo Path', 'type': 'string'}, 'branch_name': {'title': 'Branch Name', 'type': 'string'}, 'base_branch': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'Base Branch'}}, 'required': ['repo_path', 'branch_name'], 'title': 'GitCreateBranch'}}} + + +class FunctionCall(BaseModel): + name: str + arguments: Optional[str] = None + parameters: Optional[str] = None + + @validator('arguments', allow_reuse=True) + def checkPropertyArgsOrParams(cls, v, values, **kwargs): + if not v and not values.get('parameters'): + raise ValueError("At least one of 'arguments' or 'parameters' must be provided as property in FunctionCall type") + return v + + +class ToolCall(BaseModel): + id: str + index: int + type: str + function: FunctionCall + + class CompletionRequestParams(BaseModel): model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.") prompt: str | List[str] @@ -92,6 +134,7 @@ class ChatCompletionRequestParams(BaseModel): frequency_penalty: float | None = 0 function_call: str | dict | None = Field(default=None, description="Unused parameter.") functions: List[dict] | None = Field(default=None, description="Unused parameter.") + tools: List[dict] | None = Field(default=None, description="Tools signatures passed via MCP.") logit_bias: dict | None = None max_tokens: int | None = None n: int | None = Field(default=1, description="Unused parameter.") diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py index 2b414769..8cb856ff 100644 --- a/extensions/openai/utils.py +++ b/extensions/openai/utils.py @@ -1,6 +1,9 @@ import base64 import os import time +import json +import random +import re import traceback from typing import Callable, Optional @@ -52,3 +55,94 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star time.sleep(3) raise Exception('Could not start cloudflared.') + + +def getToolCallId() -> str: + letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789" + b = [random.choice(letter_bytes) for _ in range(8)] + return "call_" + "".join(b).lower() + + +def checkAndSanitizeToolCallCandidate(candidate_dict: dict, tool_names: list[str]): + # check if property 'function' exists and is a dictionary, otherwise adapt dict + if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str): + candidate_dict = {"type": "function", "function": candidate_dict} + if 'function' in candidate_dict and isinstance(candidate_dict['function'], str): + candidate_dict['name'] = candidate_dict['function'] + del candidate_dict['function'] + candidate_dict = {"type": "function", "function": candidate_dict} + if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict): + # check if 'name' exists within 'function' and is part of known tools + if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names: + candidate_dict["type"] = "function" # ensure required property 'type' exists and has the right value + # map property 'parameters' used by some older models to 'arguments' + if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]: + candidate_dict["function"]["arguments"] = candidate_dict["function"]["parameters"] + del 
candidate_dict["function"]["parameters"] + return candidate_dict + return None + + +def parseToolCall(answer: str, tool_names: list[str]): + matches = [] + + # abort on very short answers to save computation cycles + if len(answer) < 10: + return matches + + # Define the regex pattern to find the JSON content wrapped in , , , and other tags observed from various models + patterns = [ r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)" ] + + for pattern in patterns: + for match in re.finditer(pattern, answer, re.DOTALL): + # print(match.group(2)) + if match.group(2) is None: + continue + # remove backtick wraps if present + candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip()) + candidate = re.sub(r"```$", "", candidate.strip()) + # unwrap inner tags + candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL) + # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually + if re.search(r"\}\s*\n\s*\{", candidate) is not None: + candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate) + if not candidate.strip().startswith("["): + candidate = "[" + candidate + "]" + + candidates = [] + try: + # parse the candidate JSON into a dictionary + candidates = json.loads(candidate) + if not isinstance(candidates, list): + candidates = [candidates] + except json.JSONDecodeError: + # Ignore invalid JSON silently + continue + + for candidate_dict in candidates: + checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names) + if checked_candidate is not None: + matches.append(checked_candidate) + + # last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags + if len(matches) == 0: + try: + candidate = answer + # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually + if re.search(r"\}\s*\n\s*\{", candidate) is not None: + candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate) + if not candidate.strip().startswith("["): + candidate = "[" + candidate + "]" + # parse the candidate JSON into a dictionary + candidates = json.loads(candidate) + if not isinstance(candidates, list): + candidates = [candidates] + for candidate_dict in candidates: + checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names) + if checked_candidate is not None: + matches.append(checked_candidate) + except json.JSONDecodeError: + # Ignore invalid JSON silently + pass + + return matches diff --git a/modules/chat.py b/modules/chat.py index feac6bdd..b524b1b9 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -145,7 +145,7 @@ def generate_chat_prompt(user_input, state, **kwargs): instruct_renderer = partial( instruction_template.render, builtin_tools=None, - tools=None, + tools=state['tools'] if 'tools' in state else None, tools_in_user_message=False, add_generation_prompt=False ) @@ -171,9 +171,13 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "system", "content": context}) insert_pos = len(messages) - for user_msg, assistant_msg in reversed(history): - user_msg = user_msg.strip() - assistant_msg = assistant_msg.strip() + for entry in reversed(history): + user_msg = entry[0].strip() + assistant_msg = entry[1].strip() + tool_msg = entry[2].strip() if len(entry) > 2 else '' + + if tool_msg: + messages.insert(insert_pos, {"role": "tool", "content": tool_msg}) if assistant_msg: messages.insert(insert_pos, {"role": 
"assistant", "content": assistant_msg}) From a1b3307b6636b13373e8f399690cb3b782854d2c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 08:58:14 -0700 Subject: [PATCH 056/164] Bump llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index ac89f45b..3a059c91 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 431cd740..ebc33216 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 0c581f86..8ec6898f 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 6abdb1a4..afc869c8 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" 
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 682c6a47..8d7d29b7 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index ac277d61..d69aae18 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index cc412d33..540c9ac8 100644 --- 
a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 1e185079..3bb5a74a 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 1240d335..95319d75 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 6b165b7c..4b49b4e1 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 1b2b5cf2..a6ebda30 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 2793d743..bb5ba8ad 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 6d7316a6..3d17dd49 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index e56eba08..ff9fa04c 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index a7f8c703..e17f8ce7 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 5b427fd2..dd01b3a8 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.9.0/llama_cpp_binaries-0.9.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 1c7209a725c8811f2d4d2325007b2e871c5af020 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 09:46:43 -0700 Subject: [PATCH 057/164] Save the chat history periodically during streaming --- modules/chat.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/modules/chat.py b/modules/chat.py index b524b1b9..403d05e1 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -5,6 +5,7 @@ import html import json import pprint import re +import time from datetime import datetime from functools import partial from pathlib import Path @@ -485,10 +486,16 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): send_dummy_reply(state['start_with'], state) history = state['history'] + last_save_time = time.monotonic() + save_interval = 8 for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)): yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history - if i == 0: + + current_time = time.monotonic() + # Save on first iteration or if save_interval seconds have passed + if i == 0 or (current_time - last_save_time) >= save_interval: save_history(history, state['unique_id'], state['character_menu'], state['mode']) + last_save_time = current_time save_history(history, state['unique_id'], state['character_menu'], state['mode']) From 3bc2ec2b119c058446f9e9600213c75302a4ac4f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 10:34:09 -0700 Subject: [PATCH 058/164] Fix #6965 --- one_click.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/one_click.py b/one_click.py index cb16b813..482a6aa9 100644 --- a/one_click.py +++ b/one_click.py @@ -126,7 +126,7 @@ def check_env(): sys.exit(1) # Ensure this is a new environment and not the base environment - if os.environ["CONDA_DEFAULT_ENV"] == "base": + if os.environ.get("CONDA_DEFAULT_ENV", "") == "base": print("Create an environment for this project and activate it. 
Exiting...") sys.exit(1) From 9ea2a69210ab5658ba8daf6d7d604589de5fc741 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 10:41:25 -0700 Subject: [PATCH 059/164] llama.cpp: Add --no-webui to the llama-server command --- modules/llama_cpp_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index d8d2f61b..1046969a 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -261,6 +261,7 @@ class LlamaServer: "--gpu-layers", str(shared.args.gpu_layers), "--batch-size", str(shared.args.batch_size), "--port", str(self.port), + "--no-webui", ] if shared.args.flash_attn: From bf7e4a4597b6492b4c440d32a8afbda59d4ef035 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 16:12:07 -0700 Subject: [PATCH 060/164] Docs: Add a tool/function calling example (from https://github.com/oobabooga/text-generation-webui/pull/6827#issuecomment-2854716960) --- docs/12 - OpenAI API.md | 79 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 364c6b09..db9befed 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -257,6 +257,85 @@ headers = { in any of the examples above. +#### Tool/Function Calling Example + +You need to use a model with tools support. The prompt will be automatically formatted using the model's Jinja2 template. + +Request: + +``` +curl http://127.0.0.1:5000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "What time is it currently in New York City?" + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_time", + "description": "Get current time in a specific timezones", + "parameters": { + "type": "object", + "required": ["timezone"], + "properties": { + "timezone": { + "type": "string", + "description": "IANA timezone name (e.g., America/New_York, Europe/London). Use Europe/Berlin as local timezone if no timezone provided by the user." 
+ } + } + } + } + } + ] + }' +``` + +Sample response: + +``` +{ + "id": "chatcmpl-1746532051477984256", + "object": "chat.completion", + "created": 1746532051, + "model": "qwen2.5-coder-14b-instruct-q4_k_m.gguf", + "choices": [ + { + "index": 0, + "finish_reason": "tool_calls", + "message": { + "role": "assistant", + "content": "```xml\n\n{\n \"name\": \"get_current_time\",\n \"arguments\": {\n \"timezone\": \"America/New_York\"\n }\n}\n\n```" + }, + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_current_time", + "arguments": "{\"timezone\": \"America/New_York\"}" + }, + "id": "call_52ij07mh", + "index": "0" + } + ] + } + ], + "usage": { + "prompt_tokens": 224, + "completion_tokens": 38, + "total_tokens": 262 + } +} +``` + ### Environment variables The following environment variables can be used (they take precedence over everything else): From f8ef6e09af5d2e28cf67d1eea165591e156ac9d2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 18:19:04 -0700 Subject: [PATCH 061/164] UI: Make ctx-size a slider --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 33e152a0..d4d9b8b1 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -51,8 +51,8 @@ def create_ui(): with gr.Row(): with gr.Column(): shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536, 131072.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) From 512bc2d0e02bef2434370c2317bcf56e50f0513f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 8 May 2025 23:43:55 -0700 Subject: [PATCH 062/164] UI: Update some labels --- modules/ui_model_menu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index d4d9b8b1..1e27255b 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -53,12 +53,12 @@ def create_ui(): shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. 
⚠️ Lower this value if you can\'t load the model.') shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') + shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) with gr.Column(): - shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') - shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') + shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') + shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') From 2bde625d5716355b30fdd414c9b104812b101ed1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 9 May 2025 00:19:25 -0700 Subject: [PATCH 063/164] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6cc84c50..0833f9b0 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. - Multiple sampling parameters and generation options for sophisticated text generation control. - Switch between different models easily in the UI without restarting, with fine control over settings. -- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). 
+- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - 100% offline and private, with zero telemetry, external resources, or remote update requests. - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. From 8984e95c671c262b1667805895d317a9ffe9cd0a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 9 May 2025 07:21:05 -0700 Subject: [PATCH 064/164] UI: More friendly message when no model is loaded --- modules/logits.py | 7 ++++--- modules/text_generation.py | 5 +++-- modules/utils.py | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/modules/logits.py b/modules/logits.py index 32aef7ae..56a20572 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -7,6 +7,7 @@ from modules import models, shared from modules.logging_colors import logger from modules.models import load_model from modules.text_generation import generate_reply +from modules.utils import check_model_loaded global_scores = None @@ -33,9 +34,9 @@ def get_next_logits(*args, **kwargs): def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False): - if shared.model is None: - logger.error("No model is loaded! Select one in the Model tab.") - return 'Error: No model is loaded1 Select one in the Model tab.', previous + model_is_loaded, error_message = check_model_loaded() + if not model_is_loaded: + return error_message, previous # llama.cpp case if shared.model.__class__.__name__ == 'LlamaServer': diff --git a/modules/text_generation.py b/modules/text_generation.py index 7e48a2f6..c0c0350d 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -14,6 +14,7 @@ from modules.callbacks import Iteratorize from modules.extensions import apply_extensions from modules.html_generator import generate_basic_html from modules.logging_colors import logger +from modules.utils import check_model_loaded def generate_reply(*args, **kwargs): @@ -34,8 +35,8 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap # Find the appropriate generation function generate_func = apply_extensions('custom_generate_reply') if generate_func is None: - if shared.model_name == 'None' or shared.model is None: - logger.error("No model is loaded! Select one in the Model tab.") + model_is_loaded, error_message = check_model_loaded() + if not model_is_loaded: yield '' return diff --git a/modules/utils.py b/modules/utils.py index 77324139..0e390d08 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -72,6 +72,20 @@ def natural_keys(text): return [atoi(c) for c in re.split(r'(\d+)', text)] +def check_model_loaded(): + if shared.model_name == 'None' or shared.model is None: + if len(get_available_models()) <= 1: + error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it" + logger.error(error_msg) + return False, error_msg + else: + error_msg = "No model is loaded. Please select one in the Model tab." 
+ logger.error(error_msg) + return False, error_msg + + return True, None + + def get_available_models(): # Get all GGUF files gguf_files = get_available_ggufs() From 4920981b140862f3b085f614b83269e6ac228605 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 9 May 2025 20:35:38 -0700 Subject: [PATCH 065/164] UI: Remove the typing cursor --- modules/chat.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 403d05e1..b83c4bfe 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -399,16 +399,13 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess # Extract the reply if state['mode'] in ['chat', 'chat-instruct']: - visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply + '▍') + visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply) else: - visible_reply = reply + '▍' + visible_reply = reply visible_reply = html.escape(visible_reply) if shared.stop_everything: - if output['visible'][-1][1].endswith('▍'): - output['visible'][-1][1] = output['visible'][-1][1][:-1] - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output return @@ -424,9 +421,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if is_stream: yield output - if output['visible'][-1][1].endswith('▍'): - output['visible'][-1][1] = output['visible'][-1][1][:-1] - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output From 47d47585095da3a76988eabe52765a332a668d55 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 10 May 2025 17:46:00 -0700 Subject: [PATCH 066/164] Fix #6970 --- modules/shared.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/modules/shared.py b/modules/shared.py index f2698bd2..4e0a20db 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -128,9 +128,9 @@ group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') # Cache -group = parser.add_argument_group('Context and cache management') +group = parser.add_argument_group('Context and cache') group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') -group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') +group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g.
q4_q8).') # Speculative decoding group = parser.add_argument_group('Speculative decoding') @@ -159,10 +159,6 @@ group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='B group = parser.add_argument_group('TensorRT-LLM') group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') -# Cache -group = parser.add_argument_group('Cache') -group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') - # DeepSpeed group = parser.add_argument_group('DeepSpeed') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') From 006a866079d4a719f4405efdbbe18c03e106c541 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 10 May 2025 17:55:48 -0700 Subject: [PATCH 067/164] Fix API failing to cancel streams (attempt), closes #6966 --- extensions/openai/script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index a995da9d..66f38501 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -118,6 +118,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest): for resp in response: disconnected = await request.is_disconnected() if disconnected: + stop_everything_event() break yield {"data": json.dumps(resp)} @@ -141,6 +142,7 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion for resp in response: disconnected = await request.is_disconnected() if disconnected: + stop_everything_event() break yield {"data": json.dumps(resp)} From 0c5fa3728e8f0505692966f7a296e6561566c7bd Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 10 May 2025 19:12:40 -0700 Subject: [PATCH 068/164] Revert "Fix API failing to cancel streams (attempt), closes #6966" This reverts commit 006a866079d4a719f4405efdbbe18c03e106c541. 
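As an aside on the cancellation problem the two commits above wrestle with: the reverted change tried to fire the global stop event as soon as the SSE client disconnects, so the backend stops generating for an abandoned request. Below is a rough, hedged sketch of that idea — `stream_tokens()` and `stop_generation()` are placeholder stand-ins for the project's real generator and `stop_everything_event()`, not actual APIs from this repository.

```python
# Hypothetical sketch: stop generation when the SSE client disconnects.
# stream_tokens() and stop_generation() are placeholders, not project APIs.
import json

from fastapi import FastAPI, Request
from sse_starlette import EventSourceResponse

app = FastAPI()


def stream_tokens():
    """Placeholder for a blocking token generator."""
    for token in ["Hello", ",", " world"]:
        yield {"text": token}


def stop_generation():
    """Placeholder for something like stop_everything_event()."""
    pass


@app.post("/v1/completions")
async def completions(request: Request):
    async def generator():
        for chunk in stream_tokens():
            # Stop the backend instead of generating for a client that left
            if await request.is_disconnected():
                stop_generation()
                break
            yield {"data": json.dumps(chunk)}

    return EventSourceResponse(generator())
```

Note that a plain `for` loop over a blocking generator only yields control between chunks, so the disconnect check can lag behind generation; a later commit in this series addresses that by iterating the generator in a thread pool.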
--- extensions/openai/script.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 66f38501..a995da9d 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -118,7 +118,6 @@ async def openai_completions(request: Request, request_data: CompletionRequest): for resp in response: disconnected = await request.is_disconnected() if disconnected: - stop_everything_event() break yield {"data": json.dumps(resp)} @@ -142,7 +141,6 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion for resp in response: disconnected = await request.is_disconnected() if disconnected: - stop_everything_event() break yield {"data": json.dumps(resp)} From e7ac06c1694024594450437f3b899e32ab2ce6e4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 10 May 2025 19:20:04 -0700 Subject: [PATCH 069/164] New attempt --- modules/llama_cpp_server.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 1046969a..615f29ad 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -146,8 +146,9 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - # Make a direct request with streaming enabled using a context manager - with self.session.post(url, json=payload, stream=True) as response: + # Make a request with streaming enabled + response = self.session.post(url, json=payload, stream=True) + try: response.raise_for_status() # Raise an exception for HTTP errors full_text = "" @@ -185,6 +186,9 @@ class LlamaServer: print(f"Problematic line: {line}") continue + finally: + response.close() + def generate(self, prompt, state): output = "" for output in self.generate_with_streaming(prompt, state): From 62c774bf24d35a1ebdcdb9927f8a6c6ae3949c82 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 06:42:25 -0700 Subject: [PATCH 070/164] Revert "New attempt" This reverts commit e7ac06c1694024594450437f3b899e32ab2ce6e4. 
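For contrast with the two reverts above, the approach that lands in the next commit moves the blocking iteration off the event loop so disconnects can actually be observed between chunks. A simplified sketch of that pattern follows — `blocking_stream()` is an assumed stand-in for the real completion generator, not the project's function name.

```python
# Simplified sketch: iterate a blocking generator in a worker thread so the
# event loop stays free to await request.is_disconnected() between chunks.
import asyncio
import json
import time

from fastapi import FastAPI, Request
from sse_starlette import EventSourceResponse
from starlette.concurrency import iterate_in_threadpool

app = FastAPI()


def blocking_stream():
    """Stand-in for a slow, blocking token generator."""
    for i in range(3):
        time.sleep(0.5)  # simulates generation latency
        yield {"token": i}


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    async def generator():
        # iterate_in_threadpool wraps the sync iterator as an async iterator
        async for chunk in iterate_in_threadpool(blocking_stream()):
            if await request.is_disconnected():
                break
            yield {"data": json.dumps(chunk)}

    return EventSourceResponse(generator())


@app.post("/v1/chat/completions/blocking")
async def chat_completions_blocking():
    # Non-streaming path: run the whole blocking call in a worker thread
    chunks = await asyncio.to_thread(lambda: list(blocking_stream()))
    return {"choices": chunks}
```

The same two building blocks — `iterate_in_threadpool` for the streaming path and `asyncio.to_thread` for the non-streaming one — are what the following patch applies to the OpenAI-compatible endpoints.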
--- modules/llama_cpp_server.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 615f29ad..1046969a 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -146,9 +146,8 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - # Make a request with streaming enabled - response = self.session.post(url, json=payload, stream=True) - try: + # Make a direct request with streaming enabled using a context manager + with self.session.post(url, json=payload, stream=True) as response: response.raise_for_status() # Raise an exception for HTTP errors full_text = "" @@ -186,9 +185,6 @@ class LlamaServer: print(f"Problematic line: {line}") continue - finally: - response.close() - def generate(self, prompt, state): output = "" for output in self.generate_with_streaming(prompt, state): From c375b6941395454bef52d9ac0e102c0de3f4d3ee Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 11:23:33 -0700 Subject: [PATCH 071/164] API: Fix llama.cpp generating after disconnect, improve disconnect detection, fix deadlock on simultaneous requests --- extensions/openai/script.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index a995da9d..2b4f274f 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -14,6 +14,7 @@ from fastapi.requests import Request from fastapi.responses import JSONResponse from pydub import AudioSegment from sse_starlette import EventSourceResponse +from starlette.concurrency import iterate_in_threadpool import extensions.openai.completions as OAIcompletions import extensions.openai.images as OAIimages @@ -115,7 +116,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest): async def generator(): async with streaming_semaphore: response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy) - for resp in response: + async for resp in iterate_in_threadpool(response): disconnected = await request.is_disconnected() if disconnected: break @@ -125,7 +126,12 @@ async def openai_completions(request: Request, request_data: CompletionRequest): return EventSourceResponse(generator()) # SSE streaming else: - response = OAIcompletions.completions(to_dict(request_data), is_legacy=is_legacy) + response = await asyncio.to_thread( + OAIcompletions.completions, + to_dict(request_data), + is_legacy=is_legacy + ) + return JSONResponse(response) @@ -138,7 +144,7 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion async def generator(): async with streaming_semaphore: response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy) - for resp in response: + async for resp in iterate_in_threadpool(response): disconnected = await request.is_disconnected() if disconnected: break @@ -148,7 +154,12 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion return EventSourceResponse(generator()) # SSE streaming else: - response = OAIcompletions.chat_completions(to_dict(request_data), is_legacy=is_legacy) + response = await asyncio.to_thread( + OAIcompletions.chat_completions, + to_dict(request_data), + is_legacy=is_legacy + ) + return JSONResponse(response) From 3fa1a899aea3ff2700a20a8bc2da17202d3065e5 Mon Sep 17 00:00:00 2001 From: oobabooga 
<112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 12:07:59 -0700 Subject: [PATCH 072/164] UI: Fix gpu-layers being ignored (closes #6973) --- modules/loaders.py | 2 +- modules/models_settings.py | 2 +- modules/ui.py | 2 +- modules/ui_model_menu.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/loaders.py b/modules/loaders.py index 4b76549b..583b65c2 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -5,7 +5,7 @@ import gradio as gr loaders_and_params = OrderedDict({ 'llama.cpp': [ - 'n_gpu_layers', + 'gpu_layers', 'threads', 'threads_batch', 'batch_size', diff --git a/modules/models_settings.py b/modules/models_settings.py index ae589bb3..4418e3fb 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -67,7 +67,7 @@ def get_model_metadata(model): elif k.endswith('rope.scaling.factor'): model_settings['compress_pos_emb'] = metadata[k] elif k.endswith('block_count'): - model_settings['n_gpu_layers'] = metadata[k] + 1 + model_settings['gpu_layers'] = metadata[k] + 1 if 'tokenizer.chat_template' in metadata: template = metadata['tokenizer.chat_template'] diff --git a/modules/ui.py b/modules/ui.py index b3d4bccf..eeb6ce92 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -105,7 +105,7 @@ def list_model_elements(): 'filter_by_loader', 'loader', 'cpu_memory', - 'n_gpu_layers', + 'gpu_layers', 'threads', 'threads_batch', 'batch_size', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 1e27255b..b63a127c 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -50,7 +50,7 @@ def create_ui(): gr.Markdown("## Main options") with gr.Row(): with gr.Column(): - shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') From 2826c60044a05f316510ef93546b5dbff59b3864 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 14:45:46 -0700 Subject: [PATCH 073/164] Use logger for "Output generated in ..." 
messages --- modules/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index c0c0350d..00b9275a 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -470,7 +470,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None, t1 = time.time() original_tokens = len(original_input_ids[0]) new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0) - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') + logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') return @@ -499,7 +499,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N t1 = time.time() original_tokens = len(encode(original_question)[0]) new_tokens = len(encode(original_question + reply)[0]) - original_tokens - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})') + logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})') return From 035cd3e2a906a6094d0f1f298df49c0152f1a2ee Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 20:09:22 -0700 Subject: [PATCH 074/164] UI: Hide the extension install menu in portable builds --- modules/ui_session.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/modules/ui_session.py b/modules/ui_session.py index 7cf9f6e6..a4eba667 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -23,11 +23,15 @@ def create_ui(): shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table') with gr.Column(): - extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu) - extension_status = gr.Markdown() + if not shared.args.portable: + extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. 
Make sure to inspect their source code before activating them.', interactive=not mu) + extension_status = gr.Markdown() + else: + pass shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light') - extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False) + if not shared.args.portable: + extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False) # Reset interface event shared.gradio['reset_interface'].click( From c4a715fd1e86e52e3350f8126847524b488a04e2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 13 May 2025 20:14:09 -0700 Subject: [PATCH 075/164] UI: Move the LoRA menu under "Other options" --- modules/ui_model_menu.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index b63a127c..81ad1a53 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -26,25 +26,12 @@ def create_ui(): with gr.Row(): with gr.Column(): with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) - shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu) - shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) - shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) + shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) + shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu) + shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) + shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) - with gr.Column(): - if shared.args.portable: - pass - else: - with gr.Row(): - shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) - shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) - - with gr.Row(): - with gr.Column(): shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None) with gr.Blocks(): gr.Markdown("## Main options") @@ -113,6 +100,11 @@ def create_ui(): shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, 
info='Necessary to use CFG with this loader.') shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') + with gr.Row(): + shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) + shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) + with gr.Column(): with gr.Tab("Download"): From 5534d01da0913d315709a6adacd075639a6cffec Mon Sep 17 00:00:00 2001 From: oobabooga Date: Fri, 16 May 2025 00:07:37 -0300 Subject: [PATCH 076/164] Estimate the VRAM for GGUF models + autoset `gpu-layers` (#6980) --- css/main.css | 14 +++- modules/llama_cpp_server.py | 3 + modules/models.py | 1 - modules/models_settings.py | 151 +++++++++++++++++++++++++++++++++++- modules/ui_model_menu.py | 16 +++- server.py | 12 +++ 6 files changed, 193 insertions(+), 4 deletions(-) diff --git a/css/main.css b/css/main.css index 30089aca..0902b184 100644 --- a/css/main.css +++ b/css/main.css @@ -569,7 +569,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .dark .message-body :not(pre) > code { - background-color: rgb(255 255 255 / 12.5%); + background-color: rgb(255 255 255 / 10%); } #chat-input { @@ -1386,3 +1386,15 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { strong { font-weight: bold; } + +.min.svelte-1ybaih5 { + min-height: 0; +} + +#vram-info .value { + color: #008d00; +} + +.dark #vram-info .value { + color: #07ff07; +} diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 1046969a..3fc7a0cc 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -282,8 +282,10 @@ class LlamaServer: cmd.append("--no-kv-offload") if shared.args.row_split: cmd += ["--split-mode", "row"] + cache_type = "fp16" if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types: cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type] + cache_type = shared.args.cache_type if shared.args.compress_pos_emb != 1: cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)] if shared.args.rope_freq_base > 0: @@ -343,6 +345,7 @@ class LlamaServer: print(' '.join(str(item) for item in cmd[1:])) print() + logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}") # Start the server with pipes for output self.process = subprocess.Popen( cmd, diff --git a/modules/models.py b/modules/models.py index d0b0402a..9ecee803 100644 --- a/modules/models.py +++ b/modules/models.py @@ -71,7 +71,6 @@ def llama_cpp_server_loader(model_name): else: model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] - logger.info(f"llama.cpp weights detected: \"{model_file}\"") try: model = LlamaServer(model_file) return model, model diff --git a/modules/models_settings.py b/modules/models_settings.py index 4418e3fb..a8e17594 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -1,7 +1,11 @@ +import functools import json import re +import subprocess +from math import exp from pathlib import Path +import gradio as gr import yaml from modules import chat, loaders, metadata_gguf, 
shared, ui @@ -216,7 +220,17 @@ def apply_model_settings_to_state(model, state): for k in model_settings: if k in state: - state[k] = model_settings[k] + if k == 'gpu_layers': + available_vram = get_nvidia_free_vram() + n_layers = model_settings[k] + if available_vram > 0: + tolerance = 906 + while n_layers > 0 and estimate_vram(model, n_layers, state['ctx_size'], state['cache_type']) > available_vram - tolerance: + n_layers -= 1 + + state[k] = gr.update(value=n_layers, maximum=model_settings[k]) + else: + state[k] = model_settings[k] return state @@ -277,3 +291,138 @@ def save_instruction_template(model, template): yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.") else: yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.") + + +@functools.lru_cache(maxsize=None) +def get_gguf_metadata_cached(model_file): + return metadata_gguf.load_metadata(model_file) + + +def get_model_size_mb(model_file: Path) -> float: + filename = model_file.name + + # Check for multipart pattern + match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename) + + if match: + # It's a multipart file, find all matching parts + base_pattern = match.group(1) + part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf')) + total_size = sum(p.stat().st_size for p in part_files) + else: + # Single part + total_size = model_file.stat().st_size + + return total_size / (1024 ** 2) # Return size in MB + + +def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): + model_file = Path(f'{shared.args.model_dir}/{gguf_file}') + metadata = get_gguf_metadata_cached(model_file) + size_in_mb = get_model_size_mb(model_file) + + # Extract values from metadata + n_layers = None + n_kv_heads = None + embedding_dim = None + context_length = None + feed_forward_dim = None + + for key, value in metadata.items(): + if key.endswith('.block_count'): + n_layers = value + elif key.endswith('.attention.head_count_kv'): + n_kv_heads = value + elif key.endswith('.embedding_length'): + embedding_dim = value + elif key.endswith('.context_length'): + context_length = value + elif key.endswith('.feed_forward_length'): + feed_forward_dim = value + + if gpu_layers > n_layers: + gpu_layers = n_layers + + # Convert cache_type to numeric + if cache_type == 'q4_0': + cache_type = 4 + elif cache_type == 'q8_0': + cache_type = 8 + else: + cache_type = 16 + + # Derived features + size_per_layer = size_in_mb / max(n_layers, 1e-6) + context_per_layer = context_length / max(n_layers, 1e-6) + ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6) + kv_cache_factor = n_kv_heads * cache_type * ctx_size + + # Helper function for smaller + def smaller(x, y): + return 1 if x < y else 0 + + # Calculate VRAM using the model + # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/ + vram = ( + (size_per_layer - 21.19195204848197) + * exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845)) + + 0.0006621544775632052 * context_per_layer + + 3.34664386576376e-05 * kv_cache_factor + ) * (1.363306170123392 + gpu_layers) + 1255.163594536052 + + return vram + + +def get_nvidia_free_vram(): + """ + Calculates the total free VRAM across all NVIDIA GPUs by parsing nvidia-smi output. + + Returns: + int: The total free VRAM in MiB summed across all detected NVIDIA GPUs. + Returns -1 if nvidia-smi command fails (not found, error, etc.). + Returns 0 if nvidia-smi succeeds but no GPU memory info found. 
+ """ + try: + # Execute nvidia-smi command + result = subprocess.run( + ['nvidia-smi'], + capture_output=True, + text=True, + check=False + ) + + # Check if nvidia-smi returned an error + if result.returncode != 0: + return -1 + + # Parse the output for memory usage patterns + output = result.stdout + + # Find memory usage like "XXXXMiB / YYYYMiB" + # Captures used and total memory for each GPU + matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output) + + if not matches: + # No GPUs found in expected format + return 0 + + total_free_vram_mib = 0 + for used_mem_str, total_mem_str in matches: + try: + used_mib = int(used_mem_str) + total_mib = int(total_mem_str) + total_free_vram_mib += (total_mib - used_mib) + except ValueError: + # Skip malformed entries + pass + + return total_free_vram_mib + + except FileNotFoundError: + raise + # nvidia-smi not found (likely no NVIDIA drivers installed) + return -1 + except Exception: + raise + # Handle any other unexpected exceptions + return -1 diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 81ad1a53..2353f39c 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -11,6 +11,7 @@ from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model from modules.models_settings import ( apply_model_settings_to_state, + estimate_vram, get_model_metadata, save_instruction_template, save_model_settings, @@ -44,6 +45,7 @@ def create_ui(): shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) with gr.Column(): + shared.gradio['vram_info'] = gr.HTML(value=lambda: estimate_vram_wrapper(shared.args.model, shared.args.gpu_layers, shared.args.ctx_size, shared.args.cache_type)) shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) @@ -105,7 +107,6 @@ def create_ui(): ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) - with gr.Column(): with gr.Tab("Download"): shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. 
To download a single file, enter its name in the second box.", interactive=not mu) @@ -148,6 +149,11 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) + shared.gradio['model_menu'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) + shared.gradio['gpu_layers'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) + shared.gradio['ctx_size'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) + shared.gradio['cache_type'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) + if not shared.args.portable: shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) @@ -275,6 +281,14 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur yield traceback.format_exc().replace('\n', '\n\n') +def estimate_vram_wrapper(model, gpu_layers, ctx_size, cache_type): + if model in ["None", None]: + return "
Estimated VRAM to load the model:"
+
+    result = estimate_vram(model, gpu_layers, ctx_size, cache_type)
+    return f"
Estimated VRAM to load the model: {result:.0f} MiB" + + def update_truncation_length(current_length, state): if 'loader' in state: if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp': diff --git a/server.py b/server.py index b0b9e633..c35d65a8 100644 --- a/server.py +++ b/server.py @@ -49,8 +49,10 @@ from modules.extensions import apply_extensions from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model_if_idle from modules.models_settings import ( + estimate_vram, get_fallback_settings, get_model_metadata, + get_nvidia_free_vram, update_model_parameters ) from modules.shared import do_cmd_flags_warnings @@ -248,6 +250,16 @@ if __name__ == "__main__": model_settings = get_model_metadata(model_name) update_model_parameters(model_settings, initial=True) # hijack the command-line arguments + if 'gpu_layers' not in shared.provided_arguments: + available_vram = get_nvidia_free_vram() + if available_vram > 0: + n_layers = model_settings['gpu_layers'] + tolerance = 906 + while n_layers > 0 and estimate_vram(model_name, n_layers, shared.args.ctx_size, shared.args.cache_type) > available_vram - tolerance: + n_layers -= 1 + + shared.args.gpu_layers = n_layers + # Load the model shared.model, shared.tokenizer = load_model(model_name) if shared.args.lora: From 041248cc9f321aa6ff2e706083cdb776e3bf8d21 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 15 May 2025 20:10:02 -0700 Subject: [PATCH 077/164] Update llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 3a059c91..45bb5c85 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and 
python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index ebc33216..4e011989 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 8ec6898f..a3bd1350 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index afc869c8..a52f2d64 100644 --- 
a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 8d7d29b7..929b1d86 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 
d69aae18..bd7c4a4f 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 540c9ac8..b5aa1cf7 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 3bb5a74a..bc320c27 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
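For reference, the VRAM estimator introduced in PATCH 076 above can be exercised directly from a Python shell. This is only an illustrative sketch, not part of the patch series: it assumes it is run from the web UI's repository root so the modules package imports resolve, and "model.gguf" is a placeholder for a real GGUF file under the configured models directory.

# Hedged usage sketch for the estimate_vram() helper added in PATCH 076.
# Assumes a repository checkout on the path; "model.gguf" is a placeholder file name.
from modules.models_settings import estimate_vram, get_nvidia_free_vram

free_mib = get_nvidia_free_vram()  # -1 if nvidia-smi is unavailable, 0 if no GPUs were parsed
for layers in (8, 16, 32):
    est = estimate_vram("model.gguf", layers, ctx_size=8192, cache_type="fp16")
    print(f"gpu_layers={layers}: ~{est:.0f} MiB estimated (free: {free_mib} MiB)")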
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 95319d75..79959398 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 4b49b4e1..ca16e4c7 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index a6ebda30..18e1c506 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0-py3-none-macosx_13_0_arm64.whl; 
platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index bb5ba8ad..693f4712 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 3d17dd49..8635d11e 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index ff9fa04c..e844596e 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index e17f8ce7..9b7435d1 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index dd01b3a8..513b7a15 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.11.0/llama_cpp_binaries-0.11.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 8cb73b78e14154040ac4a2f7dd33dc7d46121108 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 15 May 2025 20:10:34 -0700 Subject: [PATCH 078/164] Update ExLlamaV3 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 45bb5c85..af5f7d8a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == 
"3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index a52f2d64..363365bf 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -31,5 +31,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 929b1d86..2843fed2 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,5 +32,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" 
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index bc320c27..89947cbe 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a8/exllamav3-0.0.1a8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" From fd612979330ee0009ccbb14ac5bff894b675bb82 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 15 May 2025 21:19:19 -0700 Subject: [PATCH 079/164] Lint --- extensions/openai/completions.py | 8 ++++---- extensions/openai/utils.py | 6 +++--- extensions/superboogav2/chromadb.py | 3 ++- modules/tensorrt_llm.py | 6 +++--- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index ed0bcc40..5181b18b 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -1,14 +1,14 @@ import copy -import time import json +import time from collections import deque import tiktoken +from pydantic import ValidationError from extensions.openai.errors import InvalidRequestError -from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall from extensions.openai.typing import ToolDefinition -from pydantic import ValidationError +from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall from modules import shared from modules.chat import ( generate_chat_prompt, @@ -141,7 +141,7 @@ def 
chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p tools = None if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0: - tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails + tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails messages = body['messages'] for m in messages: diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py index 8cb856ff..9a1de2e7 100644 --- a/extensions/openai/utils.py +++ b/extensions/openai/utils.py @@ -1,9 +1,9 @@ import base64 -import os -import time import json +import os import random import re +import time import traceback from typing import Callable, Optional @@ -91,7 +91,7 @@ def parseToolCall(answer: str, tool_names: list[str]): return matches # Define the regex pattern to find the JSON content wrapped in , , , and other tags observed from various models - patterns = [ r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)" ] + patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)"] for pattern in patterns: for match in re.finditer(pattern, answer, re.DOTALL): diff --git a/extensions/superboogav2/chromadb.py b/extensions/superboogav2/chromadb.py index f4f77821..9344e25c 100644 --- a/extensions/superboogav2/chromadb.py +++ b/extensions/superboogav2/chromadb.py @@ -1,10 +1,11 @@ import math import random import threading -import torch + import chromadb import numpy as np import posthog +import torch from chromadb.config import Settings from chromadb.utils import embedding_functions diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py index 73178c39..0527d493 100644 --- a/modules/tensorrt_llm.py +++ b/modules/tensorrt_llm.py @@ -1,15 +1,15 @@ from pathlib import Path -import torch - import tensorrt_llm +import torch +from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp + from modules import shared from modules.logging_colors import logger from modules.text_generation import ( get_max_prompt_length, get_reply_from_output_ids ) -from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp class TensorRTLLMModel: From cbf4daf1c8d149206da80892ced0220cf858ebb7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 15 May 2025 21:21:54 -0700 Subject: [PATCH 080/164] Hide the LoRA menu in portable mode --- modules/ui_model_menu.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 2353f39c..a1911124 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -102,10 +102,11 @@ def create_ui(): shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') - with gr.Row(): - shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) - shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) + if 
not shared.args.portable: + with gr.Row(): + shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) + shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) with gr.Column(): with gr.Tab("Download"): From 93e1850a2c1eef8fe914bd020dde3e94d6b54f6c Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 15 May 2025 21:42:15 -0700 Subject: [PATCH 081/164] Only show the VRAM info for llama.cpp --- modules/loaders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/loaders.py b/modules/loaders.py index 583b65c2..79a7a4a3 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -28,6 +28,7 @@ loaders_and_params = OrderedDict({ 'device_draft', 'ctx_size_draft', 'speculative_decoding_accordion', + 'vram_info', ], 'Transformers': [ 'gpu_split', From 4925c307cfc97c1ca549b71db6f1aaaf82fd4fb2 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 09:07:38 -0700 Subject: [PATCH 082/164] Auto-adjust GPU layers on context size and cache type changes + many fixes --- modules/models_settings.py | 78 +++++++++++++++++++++++++++++++------- modules/ui_model_menu.py | 46 ++++++++++++++-------- server.py | 23 ++++++----- 3 files changed, 109 insertions(+), 38 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index a8e17594..6ea6660c 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -213,24 +213,26 @@ def apply_model_settings_to_state(model, state): model_settings = get_model_metadata(model) if 'loader' in model_settings: loader = model_settings.pop('loader') - - # If the user is using an alternative loader for the same model type, let them keep using it if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']): state['loader'] = loader for k in model_settings: - if k in state: - if k == 'gpu_layers': - available_vram = get_nvidia_free_vram() - n_layers = model_settings[k] - if available_vram > 0: - tolerance = 906 - while n_layers > 0 and estimate_vram(model, n_layers, state['ctx_size'], state['cache_type']) > available_vram - tolerance: - n_layers -= 1 + if k in state and k != 'gpu_layers': # Skip gpu_layers, handle separately + state[k] = model_settings[k] - state[k] = gr.update(value=n_layers, maximum=model_settings[k]) - else: - state[k] = model_settings[k] + # Handle GPU layers and VRAM update for llama.cpp + if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings: + vram_info, gpu_layers_update = update_gpu_layers_and_vram( + state['loader'], + model, + model_settings['gpu_layers'], + state['ctx_size'], + state['cache_type'], + auto_adjust=True + ) + + state['gpu_layers'] = gpu_layers_update + state['vram_info'] = vram_info return state @@ -426,3 +428,53 @@ def get_nvidia_free_vram(): raise # Handle any other unexpected exceptions return -1 + + +def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True): + """ + Unified function to handle GPU layers and VRAM updates. + + Args: + for_ui: If True, returns Gradio updates. If False, returns raw values. 
+ + Returns: + - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update + - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage + """ + if loader != 'llama.cpp' or model in ["None", None]: + vram_info = "
Estimated VRAM to load the model:" + if for_ui: + return (vram_info, gr.update()) if auto_adjust else vram_info + else: + return (0, gpu_layers) if auto_adjust else 0 + + current_layers = gpu_layers + max_layers = gpu_layers + + if auto_adjust: + # Get max layers from model metadata + model_settings = get_model_metadata(model) + max_layers = model_settings.get('gpu_layers', gpu_layers) + + # Auto-adjust based on available VRAM + available_vram = get_nvidia_free_vram() + if available_vram > 0: + tolerance = 906 + current_layers = max_layers + while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: + current_layers -= 1 + + # Calculate VRAM with current layers + vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type) + + if for_ui: + vram_info = f"
Estimated VRAM to load the model: {vram_usage:.0f} MiB" + if auto_adjust: + return vram_info, gr.update(value=current_layers, maximum=max_layers) + else: + return vram_info + else: + if auto_adjust: + return vram_usage, current_layers + else: + return vram_usage diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index a1911124..b6febb50 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -11,10 +11,10 @@ from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model from modules.models_settings import ( apply_model_settings_to_state, - estimate_vram, get_model_metadata, save_instruction_template, save_model_settings, + update_gpu_layers_and_vram, update_model_parameters ) from modules.utils import gradio @@ -45,7 +45,7 @@ def create_ui(): shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) with gr.Column(): - shared.gradio['vram_info'] = gr.HTML(value=lambda: estimate_vram_wrapper(shared.args.model, shared.args.gpu_layers, shared.args.ctx_size, shared.args.cache_type)) + shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) @@ -150,10 +150,18 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) - shared.gradio['model_menu'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) - shared.gradio['gpu_layers'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) - shared.gradio['ctx_size'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) - shared.gradio['cache_type'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False) + # For ctx_size and cache_type - auto-adjust GPU layers + for param in ['ctx_size', 'cache_type']: + shared.gradio[param].change( + partial(update_gpu_layers_and_vram, auto_adjust=True), + gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), + gradio('vram_info', 'gpu_layers'), show_progress=False) + + # For manual gpu_layers changes - only update VRAM + shared.gradio['gpu_layers'].change( + partial(update_gpu_layers_and_vram, auto_adjust=False), + gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), + gradio('vram_info'), show_progress=False) if not shared.args.portable: shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) @@ -282,14 +290,6 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur yield traceback.format_exc().replace('\n', '\n\n') -def estimate_vram_wrapper(model, gpu_layers, ctx_size, cache_type): - if model in ["None", None]: - return "
Estimated VRAM to load the model:" - - result = estimate_vram(model, gpu_layers, ctx_size, cache_type) - return f"
Estimated VRAM to load the model: {result:.0f} MiB" - - def update_truncation_length(current_length, state): if 'loader' in state: if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp': @@ -298,10 +298,26 @@ def update_truncation_length(current_length, state): return current_length +def get_initial_vram_info(): + if shared.model_name != 'None' and shared.args.loader == 'llama.cpp': + return update_gpu_layers_and_vram( + shared.args.loader, + shared.model_name, + shared.args.gpu_layers, + shared.args.ctx_size, + shared.args.cache_type, + auto_adjust=False, + for_ui=True + ) + + return "
Estimated VRAM to load the model:" + + def handle_load_model_event_initial(model, state): state = apply_model_settings_to_state(model, state) output = ui.apply_interface_values(state) - update_model_parameters(state) + update_model_parameters(state) # This updates the command-line flags + return output + [state] diff --git a/server.py b/server.py index c35d65a8..c22ed1f1 100644 --- a/server.py +++ b/server.py @@ -49,10 +49,9 @@ from modules.extensions import apply_extensions from modules.LoRA import add_lora_to_model from modules.models import load_model, unload_model_if_idle from modules.models_settings import ( - estimate_vram, get_fallback_settings, get_model_metadata, - get_nvidia_free_vram, + update_gpu_layers_and_vram, update_model_parameters ) from modules.shared import do_cmd_flags_warnings @@ -250,15 +249,19 @@ if __name__ == "__main__": model_settings = get_model_metadata(model_name) update_model_parameters(model_settings, initial=True) # hijack the command-line arguments - if 'gpu_layers' not in shared.provided_arguments: - available_vram = get_nvidia_free_vram() - if available_vram > 0: - n_layers = model_settings['gpu_layers'] - tolerance = 906 - while n_layers > 0 and estimate_vram(model_name, n_layers, shared.args.ctx_size, shared.args.cache_type) > available_vram - tolerance: - n_layers -= 1 + # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model + if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings: + vram_usage, adjusted_layers = update_gpu_layers_and_vram( + shared.args.loader, + model_name, + model_settings['gpu_layers'], + shared.args.ctx_size, + shared.args.cache_type, + auto_adjust=True, + for_ui=False + ) - shared.args.gpu_layers = n_layers + shared.args.gpu_layers = adjusted_layers # Load the model shared.model, shared.tokenizer = load_model(model_name) From ee7b3028acaa38399272e78bb05272a420e72f05 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 09:12:36 -0700 Subject: [PATCH 083/164] Always cache GGUF metadata calls --- modules/models_settings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 6ea6660c..8ecd2267 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -58,7 +58,7 @@ def get_model_metadata(model): else: model_file = list(path.glob('*.gguf'))[0] - metadata = metadata_gguf.load_metadata(model_file) + metadata = load_gguf_metadata_with_cache(model_file) for k in metadata: if k.endswith('context_length'): @@ -295,8 +295,8 @@ def save_instruction_template(model, template): yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.") -@functools.lru_cache(maxsize=None) -def get_gguf_metadata_cached(model_file): +@functools.lru_cache(maxsize=1) +def load_gguf_metadata_with_cache(model_file): return metadata_gguf.load_metadata(model_file) @@ -320,7 +320,7 @@ def get_model_size_mb(model_file: Path) -> float: def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): model_file = Path(f'{shared.args.model_dir}/{gguf_file}') - metadata = get_gguf_metadata_cached(model_file) + metadata = load_gguf_metadata_with_cache(model_file) size_in_mb = get_model_size_mb(model_file) # Extract values from metadata From 9ec9b1bf837a995af8f203c8d05897510ab77c3d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 09:56:23 -0700 Subject: 
[PATCH 084/164] Auto-adjust GPU layers after model unload to utilize freed VRAM --- modules/ui_model_menu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index b6febb50..39c57bf3 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -145,7 +145,9 @@ def create_event_handlers(): partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) - shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False) + shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then( + partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False) + shared.gradio['save_model_settings'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) From 253e85a519219385668aabeabc82633c8e734ff9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 10:02:30 -0700 Subject: [PATCH 085/164] Only compute VRAM/GPU layers for llama.cpp models --- modules/models_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 8ecd2267..0eb179d7 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -441,7 +441,7 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage """ - if loader != 'llama.cpp' or model in ["None", None]: + if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"): vram_info = "
Estimated VRAM to load the model:" if for_ui: return (vram_info, gr.update()) if auto_adjust else vram_info From 38c50087feb11af41fed7a944ab0d7ef45a3bc44 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 11:55:06 -0700 Subject: [PATCH 086/164] Prevent a crash on systems without an NVIDIA GPU --- modules/models_settings.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 0eb179d7..3fdf3c84 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -421,11 +421,9 @@ def get_nvidia_free_vram(): return total_free_vram_mib except FileNotFoundError: - raise # nvidia-smi not found (likely no NVIDIA drivers installed) return -1 except Exception: - raise # Handle any other unexpected exceptions return -1 From fc483650b5e8c4933ac20b647cb822cf45856596 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 11:58:17 -0700 Subject: [PATCH 087/164] Set the maximum gpu_layers value automatically when the model is loaded with --model --- modules/ui_model_menu.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 39c57bf3..cd101c4a 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -38,7 +38,7 @@ def create_ui(): gr.Markdown("## Main options") with gr.Row(): with gr.Column(): - shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') @@ -315,6 +315,14 @@ def get_initial_vram_info(): return "
Estimated VRAM to load the model:" +def get_initial_gpu_layers_max(): + if shared.model_name != 'None' and shared.args.loader == 'llama.cpp': + model_settings = get_model_metadata(shared.model_name) + return model_settings.get('gpu_layers', 256) + + return 256 + + def handle_load_model_event_initial(model, state): state = apply_model_settings_to_state(model, state) output = ui.apply_interface_values(state) From adb975a380b219bbe14bbd7a19c83eaebc15cd55 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 12:52:43 -0700 Subject: [PATCH 088/164] Prevent fractional gpu-layers in the UI --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index cd101c4a..59bb6759 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -38,7 +38,7 @@ def create_ui(): gr.Markdown("## Main options") with gr.Row(): with gr.Column(): - shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. 
q4_q8).') From 470c822f44dce2269dfaa8e3b37989195982b975 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 12:54:39 -0700 Subject: [PATCH 089/164] API: Hide the uvicorn access logs from the terminal --- extensions/openai/script.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 2b4f274f..2c98ee78 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -447,7 +447,7 @@ def run_server(): # Start server logging.getLogger("uvicorn.error").propagate = False - uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) + uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False) def setup(): From e4d3f4449d75ea1b1f7f3438dbed8c910a970cec Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 13:02:27 -0700 Subject: [PATCH 090/164] API: Fix a regression --- modules/llama_cpp_server.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 3fc7a0cc..d695c74e 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -146,8 +146,9 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - # Make a direct request with streaming enabled using a context manager - with self.session.post(url, json=payload, stream=True) as response: + # Make the generation request + response = self.session.post(url, json=payload, stream=True) + try: response.raise_for_status() # Raise an exception for HTTP errors full_text = "" @@ -184,6 +185,8 @@ class LlamaServer: print(f"JSON decode error: {e}") print(f"Problematic line: {line}") continue + finally: + response.close() def generate(self, prompt, state): output = "" From 1c549d176b27233daf0ef6992bf5b5d8215784f9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 17:24:06 -0700 Subject: [PATCH 091/164] Fix GPU layers slider: honor saved settings and show true maximum --- modules/models_settings.py | 30 +++++++++++++++++++++--------- modules/ui_model_menu.py | 2 +- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 3fdf3c84..6715d494 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -72,6 +72,7 @@ def get_model_metadata(model): model_settings['compress_pos_emb'] = metadata[k] elif k.endswith('block_count'): model_settings['gpu_layers'] = metadata[k] + 1 + model_settings['max_gpu_layers'] = metadata[k] + 1 if 'tokenizer.chat_template' in metadata: template = metadata['tokenizer.chat_template'] @@ -450,17 +451,28 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, max_layers = gpu_layers if auto_adjust: - # Get max layers from model metadata + # Get model settings including user preferences model_settings = get_model_metadata(model) - max_layers = model_settings.get('gpu_layers', gpu_layers) - # Auto-adjust based on available VRAM - available_vram = get_nvidia_free_vram() - if available_vram > 0: - tolerance = 906 - current_layers = max_layers - while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: - current_layers -= 1 + # Check if the value is from user 
config-user.yaml + user_config = shared.user_config + model_regex = Path(model).name + '$' + has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex] + + if has_user_setting: + # Just return the current user value without adjustment + max_layers = model_settings.get('max_gpu_layers', 256) + else: + # No user setting, use model's max and auto-adjust + max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers)) + current_layers = max_layers # Start from max + + # Auto-adjust based on available VRAM + available_vram = get_nvidia_free_vram() + if available_vram > 0: + tolerance = 906 + while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: + current_layers -= 1 # Calculate VRAM with current layers vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 59bb6759..5b7dfdd8 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -318,7 +318,7 @@ def get_initial_vram_info(): def get_initial_gpu_layers_max(): if shared.model_name != 'None' and shared.args.loader == 'llama.cpp': model_settings = get_model_metadata(shared.model_name) - return model_settings.get('gpu_layers', 256) + return model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', 256)) return 256 From d99fb0a22a44dc4fb4d695647ba07cbf55e044c6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 17:29:18 -0700 Subject: [PATCH 092/164] Add backward compatibility with saved n_gpu_layers values --- modules/models_settings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/models_settings.py b/modules/models_settings.py index 6715d494..76bce7a9 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -154,6 +154,9 @@ def get_model_metadata(model): for pat in settings: if re.match(pat.lower(), Path(model).name.lower()): for k in settings[pat]: + if k == 'n_gpu_layers': + k = 'gpu_layers' + model_settings[k] = settings[pat][k] # Load instruction template if defined by name rather than by value From 71fa046c1708a235853c359ef95b363a20c762d3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 17:38:08 -0700 Subject: [PATCH 093/164] Minor changes after 1c549d176b27233daf0ef6992bf5b5d8215784f9 --- modules/models_settings.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index 76bce7a9..3a2400d4 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -457,17 +457,20 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, # Get model settings including user preferences model_settings = get_model_metadata(model) - # Check if the value is from user config-user.yaml + # Get the true maximum layers + max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers)) + + # Check if this is a user-saved setting user_config = shared.user_config model_regex = Path(model).name + '$' has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex] if has_user_setting: - # Just return the current user value without adjustment - max_layers = model_settings.get('max_gpu_layers', 256) + # For user settings, just use the current value (which already has user pref) + # but ensure the 
slider maximum is correct + current_layers = gpu_layers # Already has user setting else: - # No user setting, use model's max and auto-adjust - max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers)) + # No user setting, auto-adjust from the maximum current_layers = max_layers # Start from max # Auto-adjust based on available VRAM From e3bba510d443a0a447f85083a2dff4a116a50848 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 16 May 2025 17:48:54 -0700 Subject: [PATCH 094/164] UI: Only add a blank space to streaming messages in instruct mode --- css/main.css | 2 +- js/main.js | 2 +- modules/html_generator.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/css/main.css b/css/main.css index 0902b184..3fec7bb0 100644 --- a/css/main.css +++ b/css/main.css @@ -390,7 +390,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { margin-left: auto; margin-right: auto; flex: 1; - overflow-y: auto; + overflow-y: hidden; display: flex; flex-direction: column; word-break: break-word; diff --git a/js/main.js b/js/main.js index 205cf88e..6cecd341 100644 --- a/js/main.js +++ b/js/main.js @@ -152,7 +152,7 @@ const observer = new MutationObserver(function(mutations) { } const chatElement = document.getElementById("chat"); - if (chatElement) { + if (chatElement && chatElement.getAttribute("data-mode") === "instruct") { const messagesContainer = chatElement.querySelector(".messages"); const lastChild = messagesContainer?.lastElementChild; const prevSibling = lastChild?.previousElementSibling; diff --git a/modules/html_generator.py b/modules/html_generator.py index 67d15b6e..39659476 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -347,7 +347,7 @@ remove_button = f'' refresh_button = f'' continue_button = f'' remove_button = f'' +info_button = f'' + + +def format_message_timestamp(history, role, index): + """Get a formatted timestamp HTML span for a message if available""" + key = f"{role}_{index}" + if 'metadata' in history and key in history['metadata'] and history['metadata'][key].get('timestamp'): + timestamp = history['metadata'][key]['timestamp'] + return f"{timestamp}" + + return "" def generate_instruct_html(history): @@ -354,6 +363,23 @@ def generate_instruct_html(history): row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Create info buttons for timestamps if they exist + info_message_user = "" + if user_timestamp != "": + # Extract the timestamp value from the span + user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_user = info_button.replace("message", user_timestamp_value) + + info_message_assistant = "" + if assistant_timestamp != "": + # Extract the timestamp value from the span + assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_assistant = info_button.replace("message", assistant_timestamp_value) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' f'
{converted_visible[0]}
' f'{copy_button}' + f'{info_message_user}' f'
' f'
' ) @@ -374,6 +401,7 @@ def generate_instruct_html(history): f'{refresh_button if i == len(history["visible"]) - 1 else ""}' f'{continue_button if i == len(history["visible"]) - 1 else ""}' f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{info_message_assistant}' f'
' f'
' ) @@ -401,13 +429,17 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' f'
{img_me}
' f'
' - f'
{name1}
' + f'
{name1}{user_timestamp}
' f'
{converted_visible[0]}
' f'{copy_button}' f'
' @@ -419,7 +451,7 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'data-raw="{html.escape(row_internal[1], quote=True)}">' f'
{img_bot}
' f'
' - f'
{name2}
' + f'
{name2}{assistant_timestamp}
' f'
{converted_visible[1]}
' f'{copy_button}' f'{refresh_button if i == len(history["visible"]) - 1 else ""}' @@ -441,6 +473,23 @@ def generate_chat_html(history, name1, name2, reset_cache=False): row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Create info buttons for timestamps if they exist + info_message_user = "" + if user_timestamp != "": + # Extract the timestamp value from the span + user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_user = info_button.replace("message", user_timestamp_value) + + info_message_assistant = "" + if assistant_timestamp != "": + # Extract the timestamp value from the span + assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_assistant = info_button.replace("message", assistant_timestamp_value) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' f'
{converted_visible[0]}
' f'{copy_button}' + f'{info_message_user}' f'
' f'
' ) @@ -461,6 +511,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'{refresh_button if i == len(history["visible"]) - 1 else ""}' f'{continue_button if i == len(history["visible"]) - 1 else ""}' f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{info_message_assistant}' f'
' f'
' ) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index a0c37dad..7a5430ca 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -47,7 +47,7 @@ def create_ui(): with gr.Row(): with gr.Column(elem_id='chat-col'): shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer - shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) + shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': [], 'metadata': {}}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) with gr.Row(elem_id="chat-input-row"): with gr.Column(scale=1, elem_id='gr-hover-container'): gr.HTML(value='
', elem_id='gr-hover') From c25a381540eb8c40e945730b058ca3e83fe0674c Mon Sep 17 00:00:00 2001 From: Daniel Dengler Date: Tue, 20 May 2025 16:07:40 +0200 Subject: [PATCH 113/164] Add a "Branch here" footer button to chat messages (#6967) --- css/main.css | 66 +++++++++------------------------------ js/global_scope_js.js | 31 ++++++++++++++++++ modules/chat.py | 10 ++++-- modules/html_generator.py | 47 +++++++++++++++------------- modules/ui.py | 1 + modules/ui_chat.py | 5 +-- 6 files changed, 83 insertions(+), 77 deletions(-) diff --git a/css/main.css b/css/main.css index 319c1778..d7142336 100644 --- a/css/main.css +++ b/css/main.css @@ -1244,67 +1244,31 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: relative; } -.footer-button { +/* New container for the buttons */ +.message-actions { position: absolute; + bottom: -23px; + left: 0; + display: flex; + gap: 5px; + opacity: 0; + transition: opacity 0.2s; +} + +.footer-button { padding: 0; margin: 0; border: none; border-radius: 3px; cursor: pointer; - opacity: 0; display: flex; align-items: center; - transition: opacity 0.2s; + justify-content: center; } -.footer-button.footer-copy-button { - bottom: -23px; - left: 0; -} - -.footer-button.footer-refresh-button { - bottom: -23px; - left: 25px; -} - -.footer-button.footer-continue-button { - bottom: -23px; - left: 50px; -} - -.footer-button.footer-remove-button { - bottom: -23px; - left: 75px; -} - -.footer-button.footer-info-button { - bottom: -23px; -} - -.user-message .footer-button.footer-info-button { - left: 25px; -} - -.assistant-message:not(:last-child) .footer-button.footer-info-button { - left: 25px; -} - -.assistant-message:last-child .footer-button.footer-info-button { - left: 100px; -} - -.message:not(:last-child) .text-bot .footer-button.footer-info-button, -.message .text-you .footer-button.footer-info-button { - left: 25px; -} - -.message:last-child .text-bot .footer-button.footer-info-button { - left: 100px; -} - -.message:hover .footer-button, -.user-message:hover .footer-button, -.assistant-message:hover .footer-button { +.message:hover .message-actions, +.user-message:hover .message-actions, +.assistant-message:hover .message-actions { opacity: 1; } diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 29d2d8bd..285d82f9 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -18,6 +18,37 @@ function copyToClipboard(element) { }); } +function branchHere(element) { + if (!element) return; + + const messageElement = element.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const index = messageElement.getAttribute("data-index"); + if (!index) return; + + const branchIndexInput = document.getElementById("Branch-index").querySelector("input"); + if (!branchIndexInput) { + console.error("Element with ID 'Branch-index' not found."); + return; + } + const branchButton = document.getElementById("Branch"); + + if (!branchButton) { + console.error("Required element 'Branch' not found."); + return; + } + + branchIndexInput.value = index; + + // Trigger any 'change' or 'input' events Gradio might be listening for + const event = new Event("input", { bubbles: true }); // 'change' might also work + branchIndexInput.dispatchEvent(event); + + branchButton.click(); // Gradio will now pick up the 'index' + +} + function regenerateClick() { document.getElementById("Regenerate").click(); } diff --git a/modules/chat.py b/modules/chat.py index cbcde212..13f733e9 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ 
-1248,7 +1248,13 @@ def handle_delete_chat_confirm_click(state): def handle_branch_chat_click(state): - history = state['history'] + branch_from_index = state['branch_index'] + if branch_from_index == -1: + history = state['history'] + else: + history = state['history'] + history['visible'] = history['visible'][:branch_from_index + 1] + history['internal'] = history['internal'][:branch_from_index + 1] new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') save_history(history, new_unique_id, state['character_menu'], state['mode']) @@ -1259,7 +1265,7 @@ def handle_branch_chat_click(state): past_chats_update = gr.update(choices=histories, value=new_unique_id) - return [history, html, past_chats_update] + return [history, html, past_chats_update, -1] def handle_rename_chat_click(): diff --git a/modules/html_generator.py b/modules/html_generator.py index 5dbde6da..36b31ac5 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -335,10 +335,12 @@ copy_svg = '''''' continue_svg = '''''' remove_svg = '''''' +branch_svg = '''''' info_svg = '''''' info_svg_small = '''''' copy_button = f'' +branch_button = f'' refresh_button = f'' continue_button = f'' remove_button = f'' @@ -355,6 +357,17 @@ def format_message_timestamp(history, role, index): return "" +def actions_html(history, i, info_message=""): + return (f'
' + f'{copy_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'{continue_button if i == len(history["visible"]) - 1 else ""}' + f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{branch_button}' + f'{info_message}' + f'
') + + def generate_instruct_html(history): output = f'
' @@ -386,22 +399,18 @@ def generate_instruct_html(history): f'data-raw="{html.escape(row_internal[0], quote=True)}">' f'
' f'
{converted_visible[0]}
' - f'{copy_button}' - f'{info_message_user}' + f'
{copy_button}{info_message_user}
' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' - f'{info_message_assistant}' + f'{actions_html(history, i, info_message_assistant)}' f'
' f'
' ) @@ -441,22 +450,20 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'
' f'
{name1}{user_timestamp}
' f'
{converted_visible[0]}
' - f'{copy_button}' + f'
{copy_button}
' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
{img_bot}
' f'
' f'
{name2}{assistant_timestamp}
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{actions_html(history, i)}' f'
' f'
' ) @@ -496,22 +503,18 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'data-raw="{html.escape(row_internal[0], quote=True)}">' f'
' f'
{converted_visible[0]}
' - f'{copy_button}' - f'{info_message_user}' + f'
{copy_button}{info_message_user}
' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' - f'{info_message_assistant}' + f'{actions_html(history, i, info_message_assistant)}' f'
' f'
' ) diff --git a/modules/ui.py b/modules/ui.py index f5dc0632..5e8fa14e 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -210,6 +210,7 @@ def list_interface_input_elements(): 'negative_prompt', 'dry_sequence_breakers', 'grammar_string', + 'branch_index' ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 7a5430ca..513a632b 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -24,7 +24,8 @@ def create_ui(): with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): with gr.Column(): with gr.Row(elem_id='past-chats-buttons'): - shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', interactive=not mu) + shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', elem_id='Branch', interactive=not mu) + shared.gradio['branch_index'] = gr.Number(value=-1, precision=0, visible=False, elem_id="Branch-index", interactive=True) shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) @@ -258,7 +259,7 @@ def create_event_handlers(): shared.gradio['branch_chat'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) + chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'branch_index'), show_progress=False) shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False) shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False) From 616ea6966d4821357076ff0c3b0a37967b736dd1 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Tue, 20 May 2025 12:51:28 -0300 Subject: [PATCH 114/164] Store previous reply versions on regenerate (#7004) --- modules/chat.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/modules/chat.py b/modules/chat.py index 13f733e9..3efc55db 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -365,6 +365,34 @@ def get_stopping_strings(state): return result +def add_message_version(history, row_idx, is_current=True): + """Add the current message as a version in the history metadata""" + if 'metadata' not in history: + history['metadata'] = {} + + if row_idx >= len(history['internal']) or not history['internal'][row_idx][1].strip(): + return # Skip if row doesn't exist or message is empty + + key = f"assistant_{row_idx}" + + # Initialize metadata structures if needed + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "versions" not in history['metadata'][key]: + history['metadata'][key]["versions"] = [] + + # Add current message as a version + history['metadata'][key]["versions"].append({ + "content": history['internal'][row_idx][1], + "visible_content": history['visible'][row_idx][1], + "timestamp": get_current_timestamp() + }) + + # Update index if this is the current version + if is_current: + history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1 + + def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False): history = 
state['history'] output = copy.deepcopy(history) @@ -405,6 +433,10 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess text, visible_text = output['internal'][-1][0], output['visible'][-1][0] if regenerate: row_idx = len(output['internal']) - 1 + + # Store the existing response as a version before regenerating + add_message_version(output, row_idx, is_current=False) + if loading_message: yield { 'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]], @@ -465,6 +497,11 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if is_stream: yield output + # Add the newly generated response as a version (only for regeneration) + if regenerate: + row_idx = len(output['internal']) - 1 + add_message_version(output, row_idx, is_current=True) + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output From 51c50b265d50a46b345b1b1d4afa55b5c94d5063 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 11:15:38 -0700 Subject: [PATCH 115/164] Update llama.cpp to https://github.com/ggml-org/llama.cpp/commit/b7a17463ec190aeee7b9077c606c910fb4688b84 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 1dcf8c93..c65ab8a2 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
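The wheel pins in these requirements files rely on PEP 508 environment markers (platform_system, platform_machine, python_version) so that pip installs only the build matching the running interpreter and OS. A minimal sketch of how one such marker evaluates, assuming the third-party packaging library is available; the override dictionary is purely illustrative and not part of the project:

from packaging.markers import Marker

# Marker text copied from one of the CUDA wheel pins above.
marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')

# Evaluate against the current interpreter/OS.
print(marker.evaluate())

# Evaluate against a hypothetical environment, e.g. to see whether a
# Windows user on Python 3.11 would match this particular pin.
print(marker.evaluate({
    "platform_system": "Windows",
    "platform_machine": "AMD64",
    "python_version": "3.11",
}))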
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 4a1702e9..3da16d3e 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 0caca631..271b4bd0 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 9a439798..15df937c 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 16e77264..bd2f8339 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 468f97fa..98c25649 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 
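The macOS pins above select wheels by platform_release, which is the Darwin kernel version rather than the macOS marketing version (Darwin 22.x corresponds to macOS 13, 23.x to macOS 14, 24.x to macOS 15). A small illustrative helper, not project code, that maps the current machine to the macosx_*_0 wheel variant these ranges target; the mapping dictionary is an assumption for illustration:

import platform

def macos_wheel_variant():
    """Map the Darwin kernel major version reported by platform.release()
    to the macosx_*_0 wheel variant used by the pins above (sketch only)."""
    if platform.system() != "Darwin":
        return None
    darwin_major = int(platform.release().split(".")[0])
    # Darwin 22 -> macOS 13, Darwin 23 -> macOS 14, Darwin 24 -> macOS 15
    return {22: "macosx_13_0", 23: "macosx_14_0", 24: "macosx_15_0"}.get(darwin_major)

print(macos_wheel_variant())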
tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index eb7872ed..6e13c1d2 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 3ba42c0b..67a5cb73 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and 
python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 6831c461..409252f6 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index fbb77ec0..89adbabf 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 71575b28..0b1c03fa 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index d093ab14..eb4319b7 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 064d8e6c..0a60d4de 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 342239e8..652e9900 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 4ef3e97b..c83d61c7 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 7b39feb1..e69f3bdf 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.13.0/llama_cpp_binaries-0.13.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 5d00574a566ac8c66af16f76c9cbda6696e46e00 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 16:20:49 -0700 Subject: [PATCH 116/164] Minor UI fixes --- modules/models_settings.py | 4 ++-- modules/ui_model_menu.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index e742e0d8..df5a8e8d 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -438,7 +438,7 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage """ if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"): - vram_info = "
Estimated VRAM to load the model:" + vram_info = "
Estimated VRAM to load the model:
" if for_ui: return (vram_info, gr.update()) if auto_adjust else vram_info else: @@ -480,7 +480,7 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type) if for_ui: - vram_info = f"
Estimated VRAM to load the model: {vram_usage:.0f} MiB" + vram_info = f"
Estimated VRAM to load the model: {vram_usage:.0f} MiB
" if auto_adjust: return vram_info, gr.update(value=current_layers, maximum=max_layers) else: diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index d361f692..862b3893 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -310,7 +310,7 @@ def get_initial_vram_info(): for_ui=True ) - return "
Estimated VRAM to load the model:" + return "
Estimated VRAM to load the model:
" def get_initial_gpu_layers_max(): From 409a48d6bdd0f2bc861fc459cdd701d697bdd188 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Wed, 21 May 2025 00:36:20 -0300 Subject: [PATCH 117/164] Add attachments support (text files, PDF documents) (#7005) --- css/main.css | 56 ++++++++ modules/chat.py | 124 ++++++++++++++++-- modules/html_generator.py | 41 ++++++ modules/ui_chat.py | 6 +- requirements/full/requirements.txt | 1 + requirements/full/requirements_amd.txt | 1 + requirements/full/requirements_amd_noavx2.txt | 1 + .../full/requirements_apple_intel.txt | 1 + .../full/requirements_apple_silicon.txt | 1 + requirements/full/requirements_cpu_only.txt | 1 + .../full/requirements_cpu_only_noavx2.txt | 1 + requirements/full/requirements_noavx2.txt | 1 + requirements/full/requirements_nowheels.txt | 1 + requirements/portable/requirements.txt | 1 + .../portable/requirements_apple_intel.txt | 1 + .../portable/requirements_apple_silicon.txt | 1 + .../portable/requirements_cpu_only.txt | 1 + .../portable/requirements_cpu_only_noavx2.txt | 1 + requirements/portable/requirements_noavx2.txt | 1 + .../portable/requirements_nowheels.txt | 1 + requirements/portable/requirements_vulkan.txt | 1 + .../portable/requirements_vulkan_noavx2.txt | 1 + 22 files changed, 233 insertions(+), 12 deletions(-) diff --git a/css/main.css b/css/main.css index d7142336..6cb99fc3 100644 --- a/css/main.css +++ b/css/main.css @@ -592,6 +592,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 0.65rem 2.5rem; border: 0; box-shadow: 0; + border-radius: 8px; } #chat-input textarea::placeholder { @@ -611,6 +612,16 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: none; } +#chat-input .submit-button { + display: none; +} + +#chat-input .upload-button { + margin-right: 16px; + margin-bottom: 7px; + background: transparent; +} + .chat-input-positioned { max-width: 54rem; left: 50%; @@ -1395,3 +1406,48 @@ strong { .dark #vram-info .value { color: #07ff07; } + +.message-attachments { + display: flex; + flex-wrap: wrap; + gap: 8px; + margin-top: 8px; +} + +.attachment-box { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + padding: 8px; + background: rgb(0 0 0 / 5%); + border-radius: 6px; + border: 1px solid rgb(0 0 0 / 10%); + min-width: 80px; + max-width: 120px; +} + +.attachment-icon { + margin-bottom: 4px; + color: #555; +} + +.attachment-name { + font-size: 0.8em; + text-align: center; + word-break: break-word; + overflow: hidden; + text-overflow: ellipsis; + display: -webkit-box; + -webkit-line-clamp: 2; + -webkit-box-orient: vertical; +} + +.dark .attachment-box { + background: rgb(255 255 255 / 5%); + border: 1px solid rgb(255 255 255 / 10%); +} + +.dark .attachment-icon { + color: #ccc; +} diff --git a/modules/chat.py b/modules/chat.py index 3efc55db..cdd50c92 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -157,7 +157,9 @@ def generate_chat_prompt(user_input, state, **kwargs): impersonate = kwargs.get('impersonate', False) _continue = kwargs.get('_continue', False) also_return_rows = kwargs.get('also_return_rows', False) - history = kwargs.get('history', state['history'])['internal'] + history_data = kwargs.get('history', state['history']) + history = history_data['internal'] + metadata = history_data.get('metadata', {}) # Templates chat_template_str = state['chat_template_str'] @@ -196,11 +198,13 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "system", "content": context}) insert_pos = len(messages) - for entry in 
reversed(history): + for i, entry in enumerate(reversed(history)): user_msg = entry[0].strip() assistant_msg = entry[1].strip() tool_msg = entry[2].strip() if len(entry) > 2 else '' + row_idx = len(history) - i - 1 + if tool_msg: messages.insert(insert_pos, {"role": "tool", "content": tool_msg}) @@ -208,10 +212,40 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']: - messages.insert(insert_pos, {"role": "user", "content": user_msg}) + # Check for user message attachments in metadata + user_key = f"user_{row_idx}" + enhanced_user_msg = user_msg + + # Add attachment content if present + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + for attachment in metadata[user_key]["attachments"]: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if attachments_text: + enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:{attachments_text}" + + messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) user_input = user_input.strip() if user_input and not impersonate and not _continue: + # For the current user input being processed, check if we need to add attachments + if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" + + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + for attachment in metadata[user_key]["attachments"]: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if attachments_text: + user_input = f"{user_input}\n\nATTACHMENTS:{attachments_text}" + messages.append({"role": "user", "content": user_input}) def make_prompt(messages): @@ -280,7 +314,6 @@ def generate_chat_prompt(user_input, state, **kwargs): # Resort to truncating the user input else: - user_message = messages[-1]['content'] # Bisect the truncation point @@ -393,7 +426,74 @@ def add_message_version(history, row_idx, is_current=True): history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1 +def add_message_attachment(history, row_idx, file_path, is_user=True): + """Add a file attachment to a message in history metadata""" + if 'metadata' not in history: + history['metadata'] = {} + + key = f"{'user' if is_user else 'assistant'}_{row_idx}" + + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "attachments" not in history['metadata'][key]: + history['metadata'][key]["attachments"] = [] + + # Get file info using pathlib + path = Path(file_path) + filename = path.name + file_extension = path.suffix.lower() + + try: + # Handle different file types + if file_extension == '.pdf': + # Process PDF file + content = extract_pdf_text(path) + file_type = "application/pdf" + else: + # Default handling for text files + with open(path, 'r', encoding='utf-8') as f: + content = f.read() + file_type = "text/plain" + + # Add attachment + attachment = { + "name": filename, + "type": file_type, + "content": content, + } + + history['metadata'][key]["attachments"].append(attachment) + return content # Return the content for reuse + except Exception as e: + logger.error(f"Error 
processing attachment {filename}: {e}") + return None + + +def extract_pdf_text(pdf_path): + """Extract text from a PDF file""" + import PyPDF2 + + text = "" + try: + with open(pdf_path, 'rb') as file: + pdf_reader = PyPDF2.PdfReader(file) + for page_num in range(len(pdf_reader.pages)): + page = pdf_reader.pages[page_num] + text += page.extract_text() + "\n\n" + + return text.strip() + except Exception as e: + logger.error(f"Error extracting text from PDF: {e}") + return f"[Error extracting PDF text: {str(e)}]" + + def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False): + # Handle dict format with text and files + files = [] + if isinstance(text, dict): + files = text.get('files', []) + text = text.get('text', '') + history = state['history'] output = copy.deepcopy(history) output = apply_extensions('history', output) @@ -411,12 +511,18 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if not (regenerate or _continue): visible_text = html.escape(text) + # Process file attachments and store in metadata + row_idx = len(output['internal']) + + # Add attachments to metadata only, not modifying the message text + for file_path in files: + add_message_attachment(output, row_idx, file_path, is_user=True) + # Apply extensions text, visible_text = apply_extensions('chat_input', text, visible_text, state) text = apply_extensions('input', text, state, is_chat=True) # Current row index - row_idx = len(output['internal']) output['internal'].append([text, '']) output['visible'].append([visible_text, '']) # Add metadata with timestamp @@ -1215,7 +1321,7 @@ def handle_replace_last_reply_click(text, state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_send_dummy_message_click(text, state): @@ -1223,7 +1329,7 @@ def handle_send_dummy_message_click(text, state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_send_dummy_reply_click(text, state): @@ -1231,7 +1337,7 @@ def handle_send_dummy_reply_click(text, state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_remove_last_click(state): @@ -1239,7 +1345,7 @@ def handle_remove_last_click(state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, last_input] + return [history, html, {"text": last_input, "files": []}] def handle_unique_id_select(state): diff --git a/modules/html_generator.py b/modules/html_generator.py index 36b31ac5..f5e0b28f 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -338,6 +338,7 @@ remove_svg = '''''' info_svg = '''''' info_svg_small = '''''' +attachment_svg = '''''' copy_button = f'' branch_button = f'' @@ -357,6 +358,28 @@ 
def format_message_timestamp(history, role, index): return "" +def format_message_attachments(history, role, index): + """Get formatted HTML for message attachments if available""" + key = f"{role}_{index}" + if 'metadata' in history and key in history['metadata'] and 'attachments' in history['metadata'][key]: + attachments = history['metadata'][key]['attachments'] + if not attachments: + return "" + + attachments_html = '
' + for attachment in attachments: + attachments_html += ( + f'
' + f'
{attachment_svg}
' + f'
{html.escape(attachment["name"])}
' + f'
' + ) + attachments_html += '
' + return attachments_html + + return "" + + def actions_html(history, i, info_message=""): return (f'
' f'{copy_button}' @@ -380,6 +403,10 @@ def generate_instruct_html(history): user_timestamp = format_message_timestamp(history, "user", i) assistant_timestamp = format_message_timestamp(history, "assistant", i) + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + # Create info buttons for timestamps if they exist info_message_user = "" if user_timestamp != "": @@ -399,6 +426,7 @@ def generate_instruct_html(history): f'data-raw="{html.escape(row_internal[0], quote=True)}">' f'
' f'
{converted_visible[0]}
' + f'{user_attachments}' f'
{copy_button}{info_message_user}
' f'
' f'
' @@ -410,6 +438,7 @@ def generate_instruct_html(history): f'data-index={i}>' f'
' f'
{converted_visible[1]}
' + f'{assistant_attachments}' f'{actions_html(history, i, info_message_assistant)}' f'
' f'
' @@ -442,6 +471,10 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= user_timestamp = format_message_timestamp(history, "user", i) assistant_timestamp = format_message_timestamp(history, "assistant", i) + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' f'
{name1}{user_timestamp}
' f'
{converted_visible[0]}
' + f'{user_attachments}' f'
{copy_button}
' f'
' f'
' @@ -463,6 +497,7 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'
' f'
{name2}{assistant_timestamp}
' f'
{converted_visible[1]}
' + f'{assistant_attachments}' f'{actions_html(history, i)}' f'
' f'
' @@ -484,6 +519,10 @@ def generate_chat_html(history, name1, name2, reset_cache=False): user_timestamp = format_message_timestamp(history, "user", i) assistant_timestamp = format_message_timestamp(history, "assistant", i) + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + # Create info buttons for timestamps if they exist info_message_user = "" if user_timestamp != "": @@ -503,6 +542,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'data-raw="{html.escape(row_internal[0], quote=True)}">' f'
' f'
{converted_visible[0]}
' + f'{user_attachments}' f'
{copy_button}{info_message_user}
' f'
' f'
' @@ -514,6 +554,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'data-index={i}>' f'
' f'
{converted_visible[1]}
' + f'{assistant_attachments}' f'{actions_html(history, i, info_message_assistant)}' f'
' f'
' diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 513a632b..f244113c 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -54,7 +54,7 @@ def create_ui(): gr.HTML(value='
', elem_id='gr-hover') with gr.Column(scale=10, elem_id='chat-input-container'): - shared.gradio['textbox'] = gr.Textbox(label='', placeholder='Send a message', elem_id='chat-input', elem_classes=['add_scrollbar']) + shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], elem_id='chat-input', elem_classes=['add_scrollbar']) shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls') shared.gradio['typing-dots'] = gr.HTML(value='
', label='typing', elem_id='typing-container') @@ -186,7 +186,7 @@ def create_event_handlers(): shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( @@ -194,7 +194,7 @@ def create_event_handlers(): shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index c65ab8a2..afb5f9d4 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -13,6 +13,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 3da16d3e..46c33034 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 271b4bd0..c8e94cbd 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 15df937c..dc403ae2 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index bd2f8339..5c643c4c 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 98c25649..ccabea84 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -12,6 +12,7 @@ peft==0.15.* 
Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 6e13c1d2..7e9da47f 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 67a5cb73..fdf5cd0e 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -13,6 +13,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 2e631bf0..22d39ded 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 409252f6..ec9bafc6 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 89adbabf..025a737e 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 0b1c03fa..32644e87 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index eb4319b7..bd5c1d9b 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 0a60d4de..51f2b7d9 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 652e9900..aad6bf5a 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index 6f9566ba..4c055426 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown 
numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index c83d61c7..3d98d1b0 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index e69f3bdf..f954b8d2 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich From cc8a4fdcb114bfd068c42cea267e34daaf901a30 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 21:31:18 -0700 Subject: [PATCH 118/164] Minor improvement to attachments prompt format --- modules/chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index cdd50c92..715f4327 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -225,7 +225,7 @@ def generate_chat_prompt(user_input, state, **kwargs): attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" if attachments_text: - enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:{attachments_text}" + enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}" messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) @@ -244,7 +244,7 @@ def generate_chat_prompt(user_input, state, **kwargs): attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" if attachments_text: - user_input = f"{user_input}\n\nATTACHMENTS:{attachments_text}" + user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}" messages.append({"role": "user", "content": user_input}) From 8620d6ffe73048932594494752f82cc4a20f8f92 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 21:34:07 -0700 Subject: [PATCH 119/164] Make it possible to upload multiple text files/pdfs at once --- modules/ui_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_chat.py b/modules/ui_chat.py index f244113c..ab4b4e60 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -54,7 +54,7 @@ def create_ui(): gr.HTML(value='
', elem_id='gr-hover') with gr.Column(scale=10, elem_id='chat-input-container'): - shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], elem_id='chat-input', elem_classes=['add_scrollbar']) + shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar']) shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls') shared.gradio['typing-dots'] = gr.HTML(value='
', label='typing', elem_id='typing-container') From 0d3f85477897c2999f456713ce998b59b26a6a22 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 21:40:42 -0700 Subject: [PATCH 120/164] Improve the style of thinking blocks --- css/main.css | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/css/main.css b/css/main.css index 6cb99fc3..8444cae8 100644 --- a/css/main.css +++ b/css/main.css @@ -1370,6 +1370,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { contain: layout; } +.chat .message-body .thinking-content p, +.chat .message-body .thinking-content li { + font-size: 14px !important; +} + /* Animation for opening thinking blocks */ @keyframes fadeIn { from { opacity: 0; } From 7f6579ab20d8fd215e81f3b766f3aa9d83066bdb Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 20 May 2025 21:49:44 -0700 Subject: [PATCH 121/164] Minor style change --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 8444cae8..d1be8eb1 100644 --- a/css/main.css +++ b/css/main.css @@ -1372,7 +1372,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat .message-body .thinking-content p, .chat .message-body .thinking-content li { - font-size: 14px !important; + font-size: 15px !important; } /* Animation for opening thinking blocks */ From bae1aa34aa020aa749f942708b96e28e2b85c4a4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 25 May 2025 17:19:26 -0700 Subject: [PATCH 122/164] Fix loading `Llama-3_3-Nemotron-Super-49B-v1` and similar models (closes #7012) --- modules/models_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/models_settings.py b/modules/models_settings.py index df5a8e8d..c914bdea 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -335,7 +335,7 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): if key.endswith('.block_count'): n_layers = value elif key.endswith('.attention.head_count_kv'): - n_kv_heads = value + n_kv_heads = max(value) if isinstance(value, list) else value elif key.endswith('.embedding_length'): embedding_dim = value From 73bfc936a078ce428cc10b590a83e0391b6aed58 Mon Sep 17 00:00:00 2001 From: djholtby Date: Mon, 26 May 2025 21:39:03 -0400 Subject: [PATCH 123/164] Close response generator when stopping API generation (#7014) --- extensions/openai/script.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index b6abae20..24bcd69d 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -125,6 +125,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest): yield {"data": json.dumps(resp)} finally: stop_everything_event() + response.close() return return EventSourceResponse(generator()) # SSE streaming @@ -157,6 +158,7 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion yield {"data": json.dumps(resp)} finally: stop_everything_event() + response.close() return return EventSourceResponse(generator()) # SSE streaming From 8531100109ecc4a5bed41cc2f3adaddf9d7157f8 Mon Sep 17 00:00:00 2001 From: Underscore <47636331+Th-Underscore@users.noreply.github.com> Date: Mon, 26 May 2025 21:40:09 -0400 Subject: [PATCH 124/164] Fix textbox text usage in methods (#7009) --- modules/chat.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git 
a/modules/chat.py b/modules/chat.py index 715f4327..36a07836 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -708,8 +708,9 @@ def send_last_reply_to_input(history): return '' -def replace_last_reply(text, state): +def replace_last_reply(textbox, state): history = state['history'] + text = textbox['text'] # Initialize metadata if not present if 'metadata' not in history: @@ -726,8 +727,9 @@ def replace_last_reply(text, state): return history -def send_dummy_message(text, state): +def send_dummy_message(textbox, state): history = state['history'] + text = textbox['text'] # Initialize metadata if not present if 'metadata' not in history: @@ -741,8 +743,9 @@ def send_dummy_message(text, state): return history -def send_dummy_reply(text, state): +def send_dummy_reply(textbox, state): history = state['history'] + text = textbox['text'] # Initialize metadata if not present if 'metadata' not in history: From cc9b7253c1216e5340da85cba9b65a13cf3526e9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 May 2025 23:13:10 -0300 Subject: [PATCH 125/164] Update transformers requirement in /requirements/full (#7017) --- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- requirements/full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index afb5f9d4..3d18f5fd 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -21,7 +21,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 46c33034..82b19964 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index c8e94cbd..a8b03014 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index dc403ae2..5a61ac7d 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 5c643c4c..6862c3b4 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_cpu_only.txt 
b/requirements/full/requirements_cpu_only.txt index ccabea84..e6982779 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 7e9da47f..97bff786 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index fdf5cd0e..17c7e246 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -21,7 +21,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 22d39ded..89b32caf 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -20,7 +20,7 @@ safetensors==0.5.* scipy sentencepiece tensorboard -transformers==4.50.* +transformers==4.52.* tqdm wandb From 355b5f6c8b5552ccdae1aa363931724306bdbb16 Mon Sep 17 00:00:00 2001 From: Underscore <47636331+Th-Underscore@users.noreply.github.com> Date: Tue, 27 May 2025 21:54:18 -0400 Subject: [PATCH 126/164] UI: Add message version navigation (#6947) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> --- css/main.css | 41 ++++++++++++++++- js/global_scope_js.js | 38 ++++++++++++++++ js/main.js | 93 ++++++++++++++++++++++++++++++++++++++- modules/chat.py | 59 +++++++++++++++++++++++-- modules/html_generator.py | 27 +++++++++++- modules/ui.py | 2 + modules/ui_chat.py | 10 +++++ 7 files changed, 262 insertions(+), 8 deletions(-) diff --git a/css/main.css b/css/main.css index d1be8eb1..be27544c 100644 --- a/css/main.css +++ b/css/main.css @@ -1260,7 +1260,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: absolute; bottom: -23px; left: 0; - display: flex; + display: flex; gap: 5px; opacity: 0; transition: opacity 0.2s; @@ -1456,3 +1456,42 @@ strong { .dark .attachment-icon { color: #ccc; } + +/* --- Simple Version Navigation --- */ +.version-navigation { + position: absolute; + bottom: -23px; + right: 0; + display: flex; + align-items: center; + gap: 5px; + opacity: 0; + transition: opacity 0.2s; +} + +.message:hover .version-navigation, +.user-message:hover .version-navigation, +.assistant-message:hover .version-navigation { + opacity: 1; +} + +.version-nav-button { + padding: 2px 6px; + font-size: 12px; + min-width: auto; +} + +.version-nav-button[disabled] { + opacity: 0.3; + cursor: not-allowed; +} + +.version-position { + font-size: 11px; + color: currentColor; + font-family: monospace; + min-width: 35px; + text-align: center; + opacity: 0.8; + user-select: none; +} diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 285d82f9..9174622e 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -49,6 +49,44 @@ function branchHere(element) { } +function navigateVersion(element, direction) { + if (!element) return; + + const messageElement = element.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const index = 
messageElement.getAttribute("data-index"); + if (!index) return; + + const indexInput = document.getElementById("Navigate-message-index").querySelector("input"); + if (!indexInput) { + console.error("Element with ID 'Navigate-message-index' not found."); + return; + } + + const directionInput = document.getElementById("Navigate-direction").querySelector("textarea"); + if (!directionInput) { + console.error("Element with ID 'Navigate-direction' not found."); + return; + } + + const navigateButton = document.getElementById("Navigate-version"); + if (!navigateButton) { + console.error("Required element 'Navigate-version' not found."); + return; + } + + indexInput.value = index; + directionInput.value = direction; + + // Trigger any 'change' or 'input' events Gradio might be listening for + const event = new Event("input", { bubbles: true }); + indexInput.dispatchEvent(event); + directionInput.dispatchEvent(event); + + navigateButton.click(); +} + function regenerateClick() { document.getElementById("Regenerate").click(); } diff --git a/js/main.js b/js/main.js index 01c346a7..d90e8ade 100644 --- a/js/main.js +++ b/js/main.js @@ -39,9 +39,24 @@ document.querySelector(".header_bar").addEventListener("click", function(event) //------------------------------------------------ // Keyboard shortcuts //------------------------------------------------ + +// --- Helper functions --- // +function isModifiedKeyboardEvent() { + return (event instanceof KeyboardEvent && + event.shiftKey || + event.ctrlKey || + event.altKey || + event.metaKey); +} + +function isFocusedOnEditableTextbox() { + if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") { + return !!event.target.value; + } +} + let previousTabId = "chat-tab-button"; document.addEventListener("keydown", function(event) { - // Stop generation on Esc pressed if (event.key === "Escape") { // Find the element with id 'stop' and click it @@ -49,10 +64,15 @@ document.addEventListener("keydown", function(event) { if (stopButton) { stopButton.click(); } + return; + } + + if (!document.querySelector("#chat-tab").checkVisibility() ) { + return; } // Show chat controls on Ctrl + S - else if (event.ctrlKey && event.key == "s") { + if (event.ctrlKey && event.key == "s") { event.preventDefault(); var showControlsElement = document.getElementById("show-controls"); @@ -100,6 +120,23 @@ document.addEventListener("keydown", function(event) { document.getElementById("Impersonate").click(); } + // --- Simple version navigation --- // + if (!isFocusedOnEditableTextbox()) { + // Version navigation on Arrow keys (horizontal) + if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") { + event.preventDefault(); + navigateLastAssistantMessage("left"); + } + + else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") { + event.preventDefault(); + if (!navigateLastAssistantMessage("right")) { + // If can't navigate right (last version), regenerate + document.getElementById("Regenerate").click(); + } + } + } + }); //------------------------------------------------ @@ -789,3 +826,55 @@ function createMobileTopBar() { } createMobileTopBar(); + +//------------------------------------------------ +// Simple Navigation Functions +//------------------------------------------------ + +function navigateLastAssistantMessage(direction) { + const chat = document.querySelector("#chat"); + if (!chat) return false; + + const messages = chat.querySelectorAll("[data-index]"); + if (messages.length === 0) return false; + + // Find the last assistant message 
(starting from the end) + let lastAssistantMessage = null; + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if ( + msg.classList.contains("assistant-message") || + msg.querySelector(".circle-bot") || + msg.querySelector(".text-bot") + ) { + lastAssistantMessage = msg; + break; + } + } + + if (!lastAssistantMessage) return false; + + const buttons = lastAssistantMessage.querySelectorAll(".version-nav-button"); + + for (let i = 0; i < buttons.length; i++) { + const button = buttons[i]; + const onclick = button.getAttribute("onclick"); + const disabled = button.hasAttribute("disabled"); + + const isLeft = onclick && onclick.includes("'left'"); + const isRight = onclick && onclick.includes("'right'"); + + if (!disabled) { + if (direction === "left" && isLeft) { + navigateVersion(button, direction); + return true; + } + if (direction === "right" && isRight) { + navigateVersion(button, direction); + return true; + } + } + } + + return false; +} diff --git a/modules/chat.py b/modules/chat.py index 36a07836..6eed47ee 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -414,10 +414,20 @@ def add_message_version(history, row_idx, is_current=True): if "versions" not in history['metadata'][key]: history['metadata'][key]["versions"] = [] + # Check if this version already exists + current_content = history['internal'][row_idx][1] + current_visible = history['visible'][row_idx][1] + + for i, version in enumerate(history['metadata'][key]["versions"]): + if version['content'] == current_content and version['visible_content'] == current_visible: + if is_current: + history['metadata'][key]["current_version_index"] = i + return + # Add current message as a version history['metadata'][key]["versions"].append({ - "content": history['internal'][row_idx][1], - "visible_content": history['visible'][row_idx][1], + "content": current_content, + "visible_content": current_visible, "timestamp": get_current_timestamp() }) @@ -540,8 +550,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if regenerate: row_idx = len(output['internal']) - 1 - # Store the existing response as a version before regenerating - add_message_version(output, row_idx, is_current=False) + # Store the first response as a version before regenerating + if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'): + add_message_version(output, row_idx, is_current=False) if loading_message: yield { @@ -1414,6 +1425,46 @@ def handle_branch_chat_click(state): return [history, html, past_chats_update, -1] +def handle_navigate_version_click(state): + history = state['history'] + message_index = int(state['navigate_message_index']) + direction = state['navigate_direction'] + + # Get assistant message metadata + key = f"assistant_{message_index}" + if key not in history['metadata'] or 'versions' not in history['metadata'][key]: + # No versions to navigate + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + metadata = history['metadata'][key] + current_idx = metadata.get('current_version_index', 0) + versions = metadata['versions'] + + # Calculate new index + if direction == 'left': + new_idx = max(0, current_idx - 1) + else: # right + new_idx = min(len(versions) - 1, current_idx + 1) + + if new_idx == current_idx: + # No change needed + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] 
+ + # Update history with new version + version = versions[new_idx] + history['internal'][message_index][1] = version['content'] + history['visible'][message_index][1] = version['visible_content'] + metadata['current_version_index'] = new_idx + + # Redraw and save + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + save_history(history, state['unique_id'], state['character_menu'], state['mode']) + + return [history, html] + + def handle_rename_chat_click(): return [ gr.update(value="My New Chat"), diff --git a/modules/html_generator.py b/modules/html_generator.py index f5e0b28f..1dfeb445 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -380,6 +380,30 @@ def format_message_attachments(history, role, index): return "" +def get_version_navigation_html(history, i): + """Generate simple navigation arrows for message versions""" + key = f"assistant_{i}" + metadata = history.get('metadata', {}) + + if key not in metadata or 'versions' not in metadata[key]: + return "" + + versions = metadata[key]['versions'] + current_idx = metadata[key].get('current_version_index', 0) + + if len(versions) <= 1: + return "" + + left_disabled = ' disabled' if current_idx == 0 else '' + right_disabled = ' disabled' if current_idx >= len(versions) - 1 else '' + + left_arrow = f'' + right_arrow = f'' + position = f'{current_idx + 1}/{len(versions)}' + + return f'
{left_arrow}{position}{right_arrow}
' + + def actions_html(history, i, info_message=""): return (f'
' f'{copy_button}' @@ -388,7 +412,8 @@ def actions_html(history, i, info_message=""): f'{remove_button if i == len(history["visible"]) - 1 else ""}' f'{branch_button}' f'{info_message}' - f'
') + f'
' + f'{get_version_navigation_html(history, i)}') def generate_instruct_html(history): diff --git a/modules/ui.py b/modules/ui.py index 5e8fa14e..52c095a2 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -157,6 +157,8 @@ def list_model_elements(): def list_interface_input_elements(): elements = [ + 'navigate_message_index', + 'navigate_direction', 'temperature', 'dynatemp_low', 'dynatemp_high', diff --git a/modules/ui_chat.py b/modules/ui_chat.py index ab4b4e60..7a9f6f76 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -97,6 +97,12 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) + # Hidden elements for version navigation (similar to branch) + with gr.Row(visible=False): + shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") + shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction") + shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version") + def create_chat_settings_ui(): mu = shared.args.multi_user @@ -293,6 +299,10 @@ def create_event_handlers(): shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False) shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) + shared.gradio['navigate_version'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_navigate_version_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False) + # Save/delete a character shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter'), show_progress=False) From 5028480ebabf26ec44778588b4fbd019cd9456ed Mon Sep 17 00:00:00 2001 From: Underscore <47636331+Th-Underscore@users.noreply.github.com> Date: Tue, 27 May 2025 23:55:27 -0400 Subject: [PATCH 127/164] UI: Add footer buttons for editing messages (#7019) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> --- css/main.css | 49 +++++++++++++- js/global_scope_js.js | 132 +++++++++++++++++++++++++++++++++++++- js/main.js | 26 ++++---- modules/chat.py | 80 +++++++++++++---------- modules/html_generator.py | 44 ++++++++----- modules/ui.py | 7 +- modules/ui_chat.py | 18 +++--- 7 files changed, 282 insertions(+), 74 deletions(-) diff --git a/css/main.css b/css/main.css index be27544c..9d68ba02 100644 --- a/css/main.css +++ b/css/main.css @@ -1457,6 +1457,53 @@ strong { color: #ccc; } +/* Message Editing Styles */ +.editing-textarea { + width: 100%; + min-height: 200px; + padding: 10px; + border-radius: 5px; + border: 1px solid #ccc; + background-color: var(--light-theme-gray); + font-family: inherit; + font-size: inherit; + resize: vertical; +} + +.dark .editing-textarea { + border: 1px solid var(--border-color-dark); + background-color: var(--darker-gray); +} + +.editing-textarea:focus { + outline: none; + border-color: var(--selected-item-color-dark); +} + +.edit-controls-container 
{ + margin-top: 0; + display: flex; + gap: 8px; + padding-bottom: 8px; +} + +.edit-control-button { + padding: 6px 12px; + border: 1px solid #ccc; + border-radius: 4px; + cursor: pointer; + background-color: #f8f9fa; + color: #212529; + font-size: 12px; + margin: 0; +} + +.dark .edit-control-button { + border: 1px solid var(--border-color-dark); + background-color: var(--light-gray); + color: #efefef; +} + /* --- Simple Version Navigation --- */ .version-navigation { position: absolute; @@ -1488,7 +1535,7 @@ strong { .version-position { font-size: 11px; - color: currentColor; + color: currentcolor; font-family: monospace; min-width: 35px; text-align: center; diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 9174622e..0e86d450 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -1,3 +1,7 @@ +// ------------------------------------------------- +// Event handlers +// ------------------------------------------------- + function copyToClipboard(element) { if (!element) return; @@ -42,11 +46,135 @@ function branchHere(element) { branchIndexInput.value = index; // Trigger any 'change' or 'input' events Gradio might be listening for - const event = new Event("input", { bubbles: true }); // 'change' might also work + const event = new Event("input", { bubbles: true }); branchIndexInput.dispatchEvent(event); - branchButton.click(); // Gradio will now pick up the 'index' + branchButton.click(); +} +// ------------------------------------------------- +// Message Editing Functions +// ------------------------------------------------- + +function editHere(buttonElement) { + if (!buttonElement) return; + + const messageElement = buttonElement.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const messageBody = messageElement.querySelector(".message-body"); + if (!messageBody) return; + + // If already editing, focus the textarea + const existingTextarea = messageBody.querySelector(".editing-textarea"); + if (existingTextarea) { + existingTextarea.focus(); + return; + } + + // Determine role based on message element - handle different chat modes + const isUserMessage = messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") !== null || + messageElement.querySelector(".circle-you") !== null; + + startEditing(messageElement, messageBody, isUserMessage); +} + +function startEditing(messageElement, messageBody, isUserMessage) { + const rawText = messageElement.getAttribute("data-raw") || messageBody.textContent; + const originalHTML = messageBody.innerHTML; + + // Create editing interface + const editingInterface = createEditingInterface(rawText); + + // Replace message content + messageBody.innerHTML = ""; + messageBody.appendChild(editingInterface.textarea); + messageBody.appendChild(editingInterface.controls); + + editingInterface.textarea.focus(); + editingInterface.textarea.setSelectionRange(rawText.length, rawText.length); + + // Setup event handlers + setupEditingHandlers(editingInterface.textarea, messageElement, originalHTML, messageBody, isUserMessage); +} + +function createEditingInterface(text) { + const textarea = document.createElement("textarea"); + textarea.value = text; + textarea.className = "editing-textarea"; + textarea.rows = Math.max(3, text.split("\n").length); + + const controls = document.createElement("div"); + controls.className = "edit-controls-container"; + + const saveButton = document.createElement("button"); + saveButton.textContent = "Save"; + saveButton.className 
= "edit-control-button"; + saveButton.type = "button"; + + const cancelButton = document.createElement("button"); + cancelButton.textContent = "Cancel"; + cancelButton.className = "edit-control-button edit-cancel-button"; + cancelButton.type = "button"; + + controls.appendChild(saveButton); + controls.appendChild(cancelButton); + + return { textarea, controls, saveButton, cancelButton }; +} + +function setupEditingHandlers(textarea, messageElement, originalHTML, messageBody, isUserMessage) { + const saveButton = messageBody.querySelector(".edit-control-button:not(.edit-cancel-button)"); + const cancelButton = messageBody.querySelector(".edit-cancel-button"); + + const submitEdit = () => { + const index = messageElement.getAttribute("data-index"); + if (!index || !submitMessageEdit(index, textarea.value, isUserMessage)) { + cancelEdit(); + } + }; + + const cancelEdit = () => { + messageBody.innerHTML = originalHTML; + }; + + // Event handlers + saveButton.onclick = submitEdit; + cancelButton.onclick = cancelEdit; + + textarea.onkeydown = (e) => { + if (e.key === "Enter" && !e.shiftKey) { + e.preventDefault(); + submitEdit(); + } else if (e.key === "Escape") { + e.preventDefault(); + cancelEdit(); + } + }; +} + +function submitMessageEdit(index, newText, isUserMessage) { + const editIndexInput = document.getElementById("Edit-message-index")?.querySelector("input"); + const editTextInput = document.getElementById("Edit-message-text")?.querySelector("textarea"); + const editRoleInput = document.getElementById("Edit-message-role")?.querySelector("textarea"); + const editButton = document.getElementById("Edit-message"); + + if (!editIndexInput || !editTextInput || !editRoleInput || !editButton) { + console.error("Edit elements not found"); + return false; + } + + editIndexInput.value = index; + editTextInput.value = newText; + editRoleInput.value = isUserMessage ? 
"user" : "assistant"; + + editIndexInput.dispatchEvent(new Event("input", { bubbles: true })); + editTextInput.dispatchEvent(new Event("input", { bubbles: true })); + editRoleInput.dispatchEvent(new Event("input", { bubbles: true })); + + editButton.click(); + return true; } function navigateVersion(element, direction) { diff --git a/js/main.js b/js/main.js index d90e8ade..fc014f66 100644 --- a/js/main.js +++ b/js/main.js @@ -1,3 +1,7 @@ +// ------------------------------------------------ +// Main +// ------------------------------------------------ + let main_parent = document.getElementById("chat-tab").parentNode; let extensions = document.getElementById("extensions"); @@ -102,18 +106,6 @@ document.addEventListener("keydown", function(event) { document.getElementById("Remove-last").click(); } - // Copy last on Ctrl + Shift + K - else if (event.ctrlKey && event.shiftKey && event.key === "K") { - event.preventDefault(); - document.getElementById("Copy-last").click(); - } - - // Replace last on Ctrl + Shift + L - else if (event.ctrlKey && event.shiftKey && event.key === "L") { - event.preventDefault(); - document.getElementById("Replace-last").click(); - } - // Impersonate on Ctrl + Shift + M else if (event.ctrlKey && event.shiftKey && event.key === "M") { event.preventDefault(); @@ -388,6 +380,16 @@ document.addEventListener("click", function (event) { } }); +document.addEventListener("dblclick", (event) => { + const messageElement = event.target.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const editButton = messageElement.querySelector(".footer-edit-button"); + if (editButton) { + editButton.click(); + } +}); + //------------------------------------------------ // Relocate the "Show controls" checkbox //------------------------------------------------ diff --git a/modules/chat.py b/modules/chat.py index 6eed47ee..9598efa7 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -712,32 +712,6 @@ def remove_last_message(history): return html.unescape(last[0]), history -def send_last_reply_to_input(history): - if len(history['visible']) > 0: - return html.unescape(history['visible'][-1][1]) - else: - return '' - - -def replace_last_reply(textbox, state): - history = state['history'] - text = textbox['text'] - - # Initialize metadata if not present - if 'metadata' not in history: - history['metadata'] = {} - - if len(text.strip()) == 0: - return history - elif len(history['visible']) > 0: - row_idx = len(history['internal']) - 1 - history['visible'][-1][1] = html.escape(text) - history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) - update_message_metadata(history['metadata'], "assistant", row_idx, timestamp=get_current_timestamp()) - - return history - - def send_dummy_message(textbox, state): history = state['history'] text = textbox['text'] @@ -1330,14 +1304,6 @@ def my_yaml_output(data): return result -def handle_replace_last_reply_click(text, state): - history = replace_last_reply(text, state) - save_history(history, state['unique_id'], state['character_menu'], state['mode']) - html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - - return [history, html, {"text": "", "files": []}] - - def handle_send_dummy_message_click(text, state): history = send_dummy_message(text, state) save_history(history, state['unique_id'], state['character_menu'], state['mode']) @@ -1425,6 +1391,52 @@ def handle_branch_chat_click(state): return [history, html, 
past_chats_update, -1] +def handle_edit_message_click(state): + history = state['history'] + message_index = int(state['edit_message_index']) + new_text = state['edit_message_text'] + role = state['edit_message_role'] # "user" or "assistant" + + if message_index >= len(history['internal']): + html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html_output, gr.update()] + + # Use the role passed from frontend + is_user_msg = (role == "user") + role_idx = 0 if is_user_msg else 1 + + # For assistant messages, save the original version BEFORE updating content + if not is_user_msg: + if not history['metadata'].get(f"assistant_{message_index}", {}).get('versions'): + add_message_version(history, message_index, is_current=False) + + # NOW update the message content + history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True) + history['visible'][message_index][role_idx] = html.escape(new_text) + + # Branch if editing user message, add version if editing assistant message + if is_user_msg: + # Branch like branch-here + history['visible'] = history['visible'][:message_index + 1] + history['internal'] = history['internal'][:message_index + 1] + new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') + save_history(history, new_unique_id, state['character_menu'], state['mode']) + histories = find_all_histories_with_first_prompts(state) + past_chats_update = gr.update(choices=histories, value=new_unique_id) + state['unique_id'] = new_unique_id + elif not is_user_msg: + # Add the new version as current + add_message_version(history, message_index, is_current=True) + past_chats_update = gr.update() + else: + past_chats_update = gr.update() + + save_history(history, state['unique_id'], state['character_menu'], state['mode']) + html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + + return [history, html_output, past_chats_update] + + def handle_navigate_version_click(state): history = state['history'] message_index = int(state['navigate_message_index']) diff --git a/modules/html_generator.py b/modules/html_generator.py index 1dfeb445..9a93555f 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -336,12 +336,14 @@ refresh_svg = '''''' remove_svg = '''''' branch_svg = '''''' +edit_svg = '''''' info_svg = '''''' info_svg_small = '''''' attachment_svg = '''''' copy_button = f'' branch_button = f'' +edit_button = f'' refresh_button = f'' continue_button = f'' remove_button = f'' @@ -404,16 +406,23 @@ def get_version_navigation_html(history, i): return f'
{left_arrow}{position}{right_arrow}
' -def actions_html(history, i, info_message=""): +def actions_html(history, i, role, info_message=""): + if role == "assistant": + return (f'
' + f'{copy_button}' + f'{edit_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'{continue_button if i == len(history["visible"]) - 1 else ""}' + f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{branch_button}' + f'{info_message}' + f'
' + f'{get_version_navigation_html(history, i)}') return (f'
' f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' - f'{branch_button}' + f'{edit_button}' f'{info_message}' - f'
' - f'{get_version_navigation_html(history, i)}') + f'
') def generate_instruct_html(history): @@ -448,11 +457,12 @@ def generate_instruct_html(history): if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[0]}
' f'{user_attachments}' - f'
{copy_button}{info_message_user}
' + f'{actions_html(history, i, "user", info_message_user)}' f'
' f'
' ) @@ -464,7 +474,7 @@ def generate_instruct_html(history): f'
' f'
{converted_visible[1]}
' f'{assistant_attachments}' - f'{actions_html(history, i, info_message_assistant)}' + f'{actions_html(history, i, "assistant", info_message_assistant)}' f'
' f'
' ) @@ -503,13 +513,14 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
{img_me}
' f'
' f'
{name1}{user_timestamp}
' f'
{converted_visible[0]}
' f'{user_attachments}' - f'
{copy_button}
' + f'{actions_html(history, i, "user")}' f'
' f'
' ) @@ -523,7 +534,7 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'
{name2}{assistant_timestamp}
' f'
{converted_visible[1]}
' f'{assistant_attachments}' - f'{actions_html(history, i)}' + f'{actions_html(history, i, "assistant")}' f'
' f'
' ) @@ -564,11 +575,12 @@ def generate_chat_html(history, name1, name2, reset_cache=False): if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[0]}
' f'{user_attachments}' - f'
{copy_button}{info_message_user}
' + f'{actions_html(history, i, "user", info_message_user)}' f'
' f'
' ) @@ -580,7 +592,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False): f'
' f'
{converted_visible[1]}
' f'{assistant_attachments}' - f'{actions_html(history, i, info_message_assistant)}' + f'{actions_html(history, i, "assistant", info_message_assistant)}' f'
' f'
' ) diff --git a/modules/ui.py b/modules/ui.py index 52c095a2..00393b53 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -212,7 +212,12 @@ def list_interface_input_elements(): 'negative_prompt', 'dry_sequence_breakers', 'grammar_string', - 'branch_index' + 'navigate_message_index', + 'navigate_direction', + 'edit_message_index', + 'edit_message_text', + 'edit_message_role', + 'branch_index', ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 7a9f6f76..2856ce1f 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -71,8 +71,6 @@ def create_ui(): shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last') with gr.Row(): - shared.gradio['Replace last reply'] = gr.Button('Replace last reply (Ctrl + Shift + L)', elem_id='Replace-last') - shared.gradio['Copy last reply'] = gr.Button('Copy last reply (Ctrl + Shift + K)', elem_id='Copy-last') shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate') with gr.Row(): @@ -97,11 +95,15 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) - # Hidden elements for version navigation (similar to branch) + # Hidden elements for version navigation and editing with gr.Row(visible=False): shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction") shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version") + shared.gradio['edit_message_index'] = gr.Number(value=-1, precision=0, elem_id="Edit-message-index") + shared.gradio['edit_message_text'] = gr.Textbox(value="", elem_id="Edit-message-text") + shared.gradio['edit_message_role'] = gr.Textbox(value="", elem_id="Edit-message-role") + shared.gradio['edit_message'] = gr.Button(elem_id="Edit-message") def create_chat_settings_ui(): @@ -228,10 +230,6 @@ def create_event_handlers(): None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') - shared.gradio['Replace last reply'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_replace_last_reply_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) - shared.gradio['Send dummy message'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_send_dummy_message_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) @@ -297,12 +295,16 @@ def create_event_handlers(): None, gradio('mode'), None, js="(mode) => {mode === 'instruct' ? 
document.getElementById('character-menu').parentNode.parentNode.style.display = 'none' : document.getElementById('character-menu').parentNode.parentNode.style.display = ''}") shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False) - shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) shared.gradio['navigate_version'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_navigate_version_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False) + shared.gradio['edit_message'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False).then( + lambda: None, None, None, js='() => { const role = document.getElementById("Edit-message-role").querySelector("textarea").value; if (role === "user") document.getElementById("Regenerate").click(); }') + # Save/delete a character shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter'), show_progress=False) From 2db36da979b539263deacbd3ac8b3f6dbba7f97f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 27 May 2025 21:00:11 -0700 Subject: [PATCH 128/164] UI: Make scrollbars more discrete in dark mode --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 9d68ba02..90dd51bc 100644 --- a/css/main.css +++ b/css/main.css @@ -265,7 +265,7 @@ button { .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: #ccc; + background: rgba(255, 255, 255, 0.2); border-radius: 10px; } From f6ca0ee0727bceac867d5a5bbea0c6d61fea35ea Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 27 May 2025 21:20:51 -0700 Subject: [PATCH 129/164] Fix regenerate sometimes not creating a new message version --- modules/chat.py | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 9598efa7..59ca4d34 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -399,40 +399,26 @@ def get_stopping_strings(state): def add_message_version(history, row_idx, is_current=True): - """Add the current message as a version in the history metadata""" - if 'metadata' not in history: - history['metadata'] = {} - - if row_idx >= len(history['internal']) or not history['internal'][row_idx][1].strip(): - return # Skip if row doesn't exist or message is empty - key = f"assistant_{row_idx}" - - # Initialize metadata structures if needed if key not in history['metadata']: - history['metadata'][key] = {"timestamp": get_current_timestamp()} + history['metadata'][key] = {} + if "versions" not in history['metadata'][key]: history['metadata'][key]["versions"] = [] - # Check if this version already exists current_content = history['internal'][row_idx][1] current_visible = history['visible'][row_idx][1] - for i, version in enumerate(history['metadata'][key]["versions"]): - if version['content'] == current_content and version['visible_content'] 
== current_visible: - if is_current: - history['metadata'][key]["current_version_index"] = i - return - - # Add current message as a version + # Always add the current message as a new version entry. + # The timestamp will differentiate it even if content is identical to a previous version. history['metadata'][key]["versions"].append({ "content": current_content, "visible_content": current_visible, "timestamp": get_current_timestamp() }) - # Update index if this is the current version if is_current: + # Set the current_version_index to the newly added version (which is now the last one). history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1 From 1b0e2d8750ee315086acb2738fab76ad28abadb8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 27 May 2025 22:36:24 -0700 Subject: [PATCH 130/164] UI: Add a token counter to the chat tab (counts input + history) --- css/main.css | 7 ++++++ modules/chat.py | 54 +++++++++++++++++++++++++++++++++++++++++++++- modules/ui_chat.py | 9 ++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 90dd51bc..6e030453 100644 --- a/css/main.css +++ b/css/main.css @@ -1542,3 +1542,10 @@ strong { opacity: 0.8; user-select: none; } + +.token-display { + font-family: monospace; + font-size: 13px; + color: var(--body-text-color-subdued); + margin-top: 4px; +} diff --git a/modules/chat.py b/modules/chat.py index 59ca4d34..498c0d88 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -230,7 +230,15 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) user_input = user_input.strip() - if user_input and not impersonate and not _continue: + + # Check if we have attachments even with empty input + has_attachments = False + if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" + has_attachments = user_key in metadata and "attachments" in metadata[user_key] + + if (user_input or has_attachments) and not impersonate and not _continue: # For the current user input being processed, check if we need to add attachments if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: current_row_idx = len(history) @@ -350,6 +358,50 @@ def generate_chat_prompt(user_input, state, **kwargs): return prompt +def count_prompt_tokens(text_input, state): + """Count tokens for current history + input including attachments""" + if shared.tokenizer is None: + return "Tokenizer not available" + + try: + # Handle dict format with text and files + files = [] + if isinstance(text_input, dict): + files = text_input.get('files', []) + text = text_input.get('text', '') + else: + text = text_input + files = [] + + # Create temporary history copy to add attachments + temp_history = copy.deepcopy(state['history']) + if 'metadata' not in temp_history: + temp_history['metadata'] = {} + + # Process attachments if any + if files: + row_idx = len(temp_history['internal']) + for file_path in files: + add_message_attachment(temp_history, row_idx, file_path, is_user=True) + + # Create temp state with modified history + temp_state = copy.deepcopy(state) + temp_state['history'] = temp_history + + # Build prompt using existing logic + prompt = generate_chat_prompt(text, temp_state) + current_tokens = get_encoded_length(prompt) + max_tokens = temp_state['truncation_length'] + + 
percentage = (current_tokens / max_tokens) * 100 if max_tokens > 0 else 0 + + return f"History + Input:
{current_tokens:,} / {max_tokens:,} tokens ({percentage:.1f}%)" + + except Exception as e: + logger.error(f"Error counting tokens: {e}") + return f"Error: {str(e)}" + + def get_stopping_strings(state): stopping_strings = [] renderers = [] diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 2856ce1f..952a40a5 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -95,6 +95,11 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) + with gr.Row(): + shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm') + + shared.gradio['token_display'] = gr.HTML(value='', elem_classes='token-display') + # Hidden elements for version navigation and editing with gr.Row(visible=False): shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") @@ -360,3 +365,7 @@ def create_event_handlers(): None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') shared.gradio['show_controls'].change(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') + + shared.gradio['count_tokens'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.count_prompt_tokens, gradio('textbox', 'interface_state'), gradio('token_display'), show_progress=False) From 077bbc6b101f8f6045b95369bc82373187741d12 Mon Sep 17 00:00:00 2001 From: oobabooga Date: Wed, 28 May 2025 04:27:28 -0300 Subject: [PATCH 131/164] Add web search support (#7023) --- modules/chat.py | 4 + modules/ui.py | 6 +- modules/ui_chat.py | 12 ++ modules/web_search.py | 125 ++++++++++++++++++ requirements/full/requirements.txt | 2 + requirements/full/requirements_amd.txt | 2 + requirements/full/requirements_amd_noavx2.txt | 2 + .../full/requirements_apple_intel.txt | 2 + .../full/requirements_apple_silicon.txt | 2 + requirements/full/requirements_cpu_only.txt | 2 + .../full/requirements_cpu_only_noavx2.txt | 2 + requirements/full/requirements_noavx2.txt | 2 + requirements/full/requirements_nowheels.txt | 2 + requirements/portable/requirements.txt | 2 + .../portable/requirements_apple_intel.txt | 2 + .../portable/requirements_apple_silicon.txt | 2 + .../portable/requirements_cpu_only.txt | 2 + .../portable/requirements_cpu_only_noavx2.txt | 2 + requirements/portable/requirements_noavx2.txt | 2 + .../portable/requirements_nowheels.txt | 2 + requirements/portable/requirements_vulkan.txt | 2 + .../portable/requirements_vulkan_noavx2.txt | 2 + 22 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 modules/web_search.py diff --git a/modules/chat.py b/modules/chat.py index 498c0d88..b2aacd5c 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -31,6 +31,7 @@ from modules.text_generation import ( get_max_prompt_length ) from modules.utils import delete_file, get_available_characters, save_file +from modules.web_search import add_web_search_attachments def strftime_now(format): @@ -566,6 +567,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess for file_path in files: add_message_attachment(output, row_idx, file_path, is_user=True) + # Add web search results as attachments if enabled + add_web_search_attachments(output, row_idx, 
text, state) + # Apply extensions text, visible_text = apply_extensions('chat_input', text, visible_text, state) text = apply_extensions('input', text, state, is_chat=True) diff --git a/modules/ui.py b/modules/ui.py index 00393b53..e24e6402 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -157,8 +157,6 @@ def list_model_elements(): def list_interface_input_elements(): elements = [ - 'navigate_message_index', - 'navigate_direction', 'temperature', 'dynatemp_low', 'dynatemp_high', @@ -218,6 +216,10 @@ def list_interface_input_elements(): 'edit_message_text', 'edit_message_role', 'branch_index', + 'enable_web_search', + 'web_search_pages', + 'navigate_message_index', + 'navigate_direction', ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 952a40a5..719af85a 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -86,6 +86,12 @@ def create_ui(): with gr.Row(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) + with gr.Row(): + shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search') + + with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: + shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) + with gr.Row(): shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') @@ -369,3 +375,9 @@ def create_event_handlers(): shared.gradio['count_tokens'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.count_prompt_tokens, gradio('textbox', 'interface_state'), gradio('token_display'), show_progress=False) + + shared.gradio['enable_web_search'].change( + lambda x: gr.update(visible=x), + gradio('enable_web_search'), + gradio('web_search_row') + ) diff --git a/modules/web_search.py b/modules/web_search.py new file mode 100644 index 00000000..e7688ba4 --- /dev/null +++ b/modules/web_search.py @@ -0,0 +1,125 @@ +from datetime import datetime + +import requests +from bs4 import BeautifulSoup +from duckduckgo_search import DDGS + +from modules.logging_colors import logger +from modules.text_generation import generate_reply + + +def get_current_timestamp(): + """Returns the current time in 24-hour format""" + return datetime.now().strftime('%b %d, %Y %H:%M') + + +def generate_search_query(user_message, state): + """Generate a search query from user message using the LLM""" + search_prompt = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else." 
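
# A minimal illustrative sketch (not a diff line) of how the functions defined in this
# new module fit together, based on the code added in this patch:
#
#     add_web_search_attachments(history, row_idx, user_message, state)
#       -> generate_search_query(user_message, state)      # LLM turns the message into a short query
#       -> perform_web_search(query, num_pages)            # DDGS search; pages fetched with requests + BeautifulSoup
#       -> history['metadata'][f"user_{row_idx}"]["attachments"] gains items like
#          {"name": title, "type": "text/html", "content": f"URL: {url}\n\n{page_text}"}
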
+ + # Use a minimal state for search query generation + search_state = state.copy() + search_state['max_new_tokens'] = 64 + search_state['temperature'] = 0.1 + + query = "" + for reply in generate_reply(search_prompt, search_state, stopping_strings=[], is_chat=False): + query = reply.strip() + + return query + + +def download_web_page(url, timeout=10): + """Download and extract text from a web page""" + try: + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + response = requests.get(url, headers=headers, timeout=timeout) + response.raise_for_status() + + soup = BeautifulSoup(response.content, 'html.parser') + + # Remove script and style elements + for script in soup(["script", "style"]): + script.decompose() + + # Get text and clean it up + text = soup.get_text() + lines = (line.strip() for line in text.splitlines()) + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + text = ' '.join(chunk for chunk in chunks if chunk) + + return text + except Exception as e: + logger.error(f"Error downloading {url}: {e}") + return f"[Error downloading content from {url}: {str(e)}]" + + +def perform_web_search(query, num_pages=3): + """Perform web search and return results with content""" + try: + with DDGS() as ddgs: + results = list(ddgs.text(query, max_results=num_pages)) + + search_results = [] + for i, result in enumerate(results): + url = result.get('href', '') + title = result.get('title', f'Search Result {i+1}') + + # Download page content + content = download_web_page(url) + + search_results.append({ + 'title': title, + 'url': url, + 'content': content + }) + + return search_results + except Exception as e: + logger.error(f"Error performing web search: {e}") + return [] + + +def add_web_search_attachments(history, row_idx, user_message, state): + """Perform web search and add results as attachments""" + if not state.get('enable_web_search', False): + return + + try: + # Generate search query + search_query = generate_search_query(user_message, state) + if not search_query: + logger.warning("Failed to generate search query") + return + + logger.info(f"Generated search query: {search_query}") + + # Perform web search + num_pages = int(state.get('web_search_pages', 3)) + search_results = perform_web_search(search_query, num_pages) + + if not search_results: + logger.warning("No search results found") + return + + # Add search results as attachments + key = f"user_{row_idx}" + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "attachments" not in history['metadata'][key]: + history['metadata'][key]["attachments"] = [] + + for result in search_results: + attachment = { + "name": f"{result['title']}", + "type": "text/html", + "content": f"URL: {result['url']}\n\n{result['content']}" + } + history['metadata'][key]["attachments"].append(attachment) + + logger.info(f"Added {len(search_results)} web search results as attachments") + + except Exception as e: + logger.error(f"Error in web search: {e}") diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 3d18f5fd..0eaf10da 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,7 +1,9 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_amd.txt 
b/requirements/full/requirements_amd.txt index 82b19964..65f184bf 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index a8b03014..d20b2ec3 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 5a61ac7d..2613d787 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 6862c3b4..af583b00 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index e6982779..9bf2a37d 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 97bff786..1731448e 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 17c7e246..fc481a1a 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -1,7 +1,9 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 89b32caf..2ed8affa 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index ec9bafc6..fdae681d 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 025a737e..a58f39f7 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ 
b/requirements/portable/requirements_apple_intel.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 32644e87..91ea3a6d 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index bd5c1d9b..37e5aa40 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 51f2b7d9..dcb2884b 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index aad6bf5a..8f1295bb 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index 4c055426..21805fe2 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 3d98d1b0..858b4488 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index f954b8d2..569bae99 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -1,3 +1,5 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 From 75c6ae8502cae60bd8dabef1e2af4aec5766ca35 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 00:29:17 -0700 Subject: [PATCH 132/164] UI: Don't edit messages on double click --- js/main.js | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/js/main.js b/js/main.js index fc014f66..48bb8632 100644 --- a/js/main.js +++ b/js/main.js @@ -380,16 +380,6 @@ document.addEventListener("click", function (event) { } }); -document.addEventListener("dblclick", (event) => { - const messageElement = event.target.closest(".message, .user-message, .assistant-message"); - if (!messageElement) return; - - const editButton = messageElement.querySelector(".footer-edit-button"); - if (editButton) { - editButton.click(); - } -}); - 
//------------------------------------------------ // Relocate the "Show controls" checkbox //------------------------------------------------ From 0aedb8992165b386dac244baeb5fb5967513869e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 00:35:20 -0700 Subject: [PATCH 133/164] UI: Small style improvement to attachments --- css/main.css | 1 + 1 file changed, 1 insertion(+) diff --git a/css/main.css b/css/main.css index 6e030453..181a19b8 100644 --- a/css/main.css +++ b/css/main.css @@ -1417,6 +1417,7 @@ strong { flex-wrap: wrap; gap: 8px; margin-top: 8px; + padding-bottom: 6px; } .attachment-box { From 6c3590ba9ab0bd540097a50986a59f0099d11d92 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 05:28:15 -0700 Subject: [PATCH 134/164] Make web search attachments clickable --- modules/html_generator.py | 8 +++++++- modules/web_search.py | 5 +++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/modules/html_generator.py b/modules/html_generator.py index 9a93555f..bfb278cd 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -370,10 +370,16 @@ def format_message_attachments(history, role, index): attachments_html = '
' for attachment in attachments: + name = html.escape(attachment["name"]) + + # Make clickable if URL exists + if "url" in attachment: + name = f'{name}' + attachments_html += ( f'
' f'
{attachment_svg}
' - f'
{html.escape(attachment["name"])}
' + f'
{name}
' f'
' ) attachments_html += '
' diff --git a/modules/web_search.py b/modules/web_search.py index e7688ba4..d3387ac9 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -113,9 +113,10 @@ def add_web_search_attachments(history, row_idx, user_message, state): for result in search_results: attachment = { - "name": f"{result['title']}", + "name": result['title'], "type": "text/html", - "content": f"URL: {result['url']}\n\n{result['content']}" + "url": result['url'], + "content": result['content'] } history['metadata'][key]["attachments"].append(attachment) From 27641ac1823751165615a1a53b62ae24977e37a0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 17:09:05 -0700 Subject: [PATCH 135/164] UI: Make message editing work the same for user and assistant messages --- js/global_scope_js.js | 28 ++++++------ modules/chat.py | 94 ++++++++++++++++++++------------------- modules/html_generator.py | 42 ++++++++++------- modules/ui.py | 3 +- modules/ui_chat.py | 4 +- 5 files changed, 94 insertions(+), 77 deletions(-) diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 0e86d450..3274f47e 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -186,31 +186,33 @@ function navigateVersion(element, direction) { const index = messageElement.getAttribute("data-index"); if (!index) return; - const indexInput = document.getElementById("Navigate-message-index").querySelector("input"); - if (!indexInput) { - console.error("Element with ID 'Navigate-message-index' not found."); - return; - } - - const directionInput = document.getElementById("Navigate-direction").querySelector("textarea"); - if (!directionInput) { - console.error("Element with ID 'Navigate-direction' not found."); - return; + // Determine role based on message element classes + let role = "assistant"; // Default role + if (messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") || + messageElement.querySelector(".circle-you")) { + role = "user"; } + const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input"); + const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea"); + const roleInput = document.getElementById("Navigate-message-role")?.querySelector("textarea"); const navigateButton = document.getElementById("Navigate-version"); - if (!navigateButton) { - console.error("Required element 'Navigate-version' not found."); + + if (!indexInput || !directionInput || !roleInput || !navigateButton) { + console.error("Navigation control elements (index, direction, role, or button) not found."); return; } indexInput.value = index; directionInput.value = direction; + roleInput.value = role; - // Trigger any 'change' or 'input' events Gradio might be listening for + // Trigger 'input' events for Gradio to pick up changes const event = new Event("input", { bubbles: true }); indexInput.dispatchEvent(event); directionInput.dispatchEvent(event); + roleInput.dispatchEvent(event); navigateButton.click(); } diff --git a/modules/chat.py b/modules/chat.py index b2aacd5c..8bac680c 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -451,19 +451,21 @@ def get_stopping_strings(state): return result -def add_message_version(history, row_idx, is_current=True): - key = f"assistant_{row_idx}" +def add_message_version(history, role, row_idx, is_current=True): + key = f"{role}_{row_idx}" + if 'metadata' not in history: + history['metadata'] = {} if key not in history['metadata']: 
history['metadata'][key] = {} if "versions" not in history['metadata'][key]: history['metadata'][key]["versions"] = [] - current_content = history['internal'][row_idx][1] - current_visible = history['visible'][row_idx][1] + # Determine which index to use for content based on role + content_idx = 0 if role == 'user' else 1 + current_content = history['internal'][row_idx][content_idx] + current_visible = history['visible'][row_idx][content_idx] - # Always add the current message as a new version entry. - # The timestamp will differentiate it even if content is identical to a previous version. history['metadata'][key]["versions"].append({ "content": current_content, "visible_content": current_visible, @@ -594,7 +596,7 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess # Store the first response as a version before regenerating if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'): - add_message_version(output, row_idx, is_current=False) + add_message_version(output, "assistant", row_idx, is_current=False) if loading_message: yield { @@ -656,12 +658,13 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if is_stream: yield output + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) + # Add the newly generated response as a version (only for regeneration) if regenerate: row_idx = len(output['internal']) - 1 - add_message_version(output, row_idx, is_current=True) + add_message_version(output, "assistant", row_idx, is_current=True) - output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output @@ -1441,37 +1444,35 @@ def handle_edit_message_click(state): if message_index >= len(history['internal']): html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html_output, gr.update()] + return [history, html_output, gr.update()] # No unique_id change - # Use the role passed from frontend - is_user_msg = (role == "user") - role_idx = 0 if is_user_msg else 1 + role_idx = 0 if role == "user" else 1 - # For assistant messages, save the original version BEFORE updating content - if not is_user_msg: - if not history['metadata'].get(f"assistant_{message_index}", {}).get('versions'): - add_message_version(history, message_index, is_current=False) + if 'metadata' not in history: + history['metadata'] = {} + + key = f"{role}_{message_index}" + if key not in history['metadata']: + history['metadata'][key] = {} + + # If no versions exist yet for this message, store the current (pre-edit) content as the first version. 
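
# Illustrative shape of the metadata after an edit (values are made up): the pre-edit
# text saved here becomes the first version, and add_message_version() further down
# appends the edited text and points current_version_index at it, roughly:
#
#     history['metadata']['assistant_3'] = {
#         'versions': [
#             {'content': '...', 'visible_content': '...', 'timestamp': 'May 28, 2025 17:05'},
#             {'content': '...', 'visible_content': '...', 'timestamp': 'May 28, 2025 17:09'},
#         ],
#         'current_version_index': 1,
#     }
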
+ if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]: + original_content = history['internal'][message_index][role_idx] + original_visible = history['visible'][message_index][role_idx] + + history['metadata'][key]["versions"] = [{ + "content": original_content, + "visible_content": original_visible, + "timestamp": get_current_timestamp() + }] - # NOW update the message content history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True) history['visible'][message_index][role_idx] = html.escape(new_text) - # Branch if editing user message, add version if editing assistant message - if is_user_msg: - # Branch like branch-here - history['visible'] = history['visible'][:message_index + 1] - history['internal'] = history['internal'][:message_index + 1] - new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') - save_history(history, new_unique_id, state['character_menu'], state['mode']) - histories = find_all_histories_with_first_prompts(state) - past_chats_update = gr.update(choices=histories, value=new_unique_id) - state['unique_id'] = new_unique_id - elif not is_user_msg: - # Add the new version as current - add_message_version(history, message_index, is_current=True) - past_chats_update = gr.update() - else: - past_chats_update = gr.update() + add_message_version(history, role, message_index, is_current=True) + + # Since we are not branching, unique_id does not change. + past_chats_update = gr.update() save_history(history, state['unique_id'], state['character_menu'], state['mode']) html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) @@ -1483,33 +1484,36 @@ def handle_navigate_version_click(state): history = state['history'] message_index = int(state['navigate_message_index']) direction = state['navigate_direction'] + role = state['navigate_message_role'] - # Get assistant message metadata - key = f"assistant_{message_index}" - if key not in history['metadata'] or 'versions' not in history['metadata'][key]: - # No versions to navigate + if not role: + logger.error("Role not provided for version navigation.") + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + key = f"{role}_{message_index}" + if 'metadata' not in history or key not in history['metadata'] or 'versions' not in history['metadata'][key]: html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) return [history, html] metadata = history['metadata'][key] - current_idx = metadata.get('current_version_index', 0) versions = metadata['versions'] + # Default to the last version if current_version_index is not set + current_idx = metadata.get('current_version_index', len(versions) - 1 if versions else 0) - # Calculate new index if direction == 'left': new_idx = max(0, current_idx - 1) else: # right new_idx = min(len(versions) - 1, current_idx + 1) if new_idx == current_idx: - # No change needed html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) return [history, html] - # Update history with new version - version = versions[new_idx] - history['internal'][message_index][1] = version['content'] - history['visible'][message_index][1] = version['visible_content'] + msg_content_idx = 0 if role == 'user' else 1 # 0 for user content, 1 for assistant content 
in the pair + version_to_load = versions[new_idx] + history['internal'][message_index][msg_content_idx] = version_to_load['content'] + history['visible'][message_index][msg_content_idx] = version_to_load['visible_content'] metadata['current_version_index'] = new_idx # Redraw and save diff --git a/modules/html_generator.py b/modules/html_generator.py index bfb278cd..cbf3e19c 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -388,16 +388,17 @@ def format_message_attachments(history, role, index): return "" -def get_version_navigation_html(history, i): +def get_version_navigation_html(history, i, role): """Generate simple navigation arrows for message versions""" - key = f"assistant_{i}" + key = f"{role}_{i}" metadata = history.get('metadata', {}) if key not in metadata or 'versions' not in metadata[key]: return "" versions = metadata[key]['versions'] - current_idx = metadata[key].get('current_version_index', 0) + # Default to the last version if current_version_index isn't set in metadata + current_idx = metadata[key].get('current_version_index', len(versions) - 1 if versions else 0) if len(versions) <= 1: return "" @@ -413,22 +414,33 @@ def get_version_navigation_html(history, i): def actions_html(history, i, role, info_message=""): + action_buttons = "" + version_nav_html = "" + if role == "assistant": - return (f'
' - f'{copy_button}' - f'{edit_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' - f'{branch_button}' - f'{info_message}' - f'
' - f'{get_version_navigation_html(history, i)}') - return (f'
' + action_buttons = ( f'{copy_button}' f'{edit_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'{continue_button if i == len(history["visible"]) - 1 else ""}' + f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{branch_button}' + ) + + version_nav_html = get_version_navigation_html(history, i, "assistant") + elif role == "user": + action_buttons = ( + f'{copy_button}' + f'{edit_button}' + ) + + version_nav_html = get_version_navigation_html(history, i, "user") + + return (f'
' + f'{action_buttons}' f'{info_message}' - f'
') + f'
' + f'{version_nav_html}') def generate_instruct_html(history): diff --git a/modules/ui.py b/modules/ui.py index e24e6402..a2662e14 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -212,14 +212,13 @@ def list_interface_input_elements(): 'grammar_string', 'navigate_message_index', 'navigate_direction', + 'navigate_message_role', 'edit_message_index', 'edit_message_text', 'edit_message_role', 'branch_index', 'enable_web_search', 'web_search_pages', - 'navigate_message_index', - 'navigate_direction', ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 719af85a..df3d3929 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -110,6 +110,7 @@ def create_ui(): with gr.Row(visible=False): shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction") + shared.gradio['navigate_message_role'] = gr.Textbox(value="", elem_id="Navigate-message-role") shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version") shared.gradio['edit_message_index'] = gr.Number(value=-1, precision=0, elem_id="Edit-message-index") shared.gradio['edit_message_text'] = gr.Textbox(value="", elem_id="Edit-message-text") @@ -313,8 +314,7 @@ def create_event_handlers(): shared.gradio['edit_message'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False).then( - lambda: None, None, None, js='() => { const role = document.getElementById("Edit-message-role").querySelector("textarea").value; if (role === "user") document.getElementById("Regenerate").click(); }') + chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) # Save/delete a character shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) From 3eb0b77427ad7b87c128999fd915f97b22104819 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 18:14:51 -0700 Subject: [PATCH 136/164] Improve the web search query generation --- modules/chat.py | 25 ++++++++++++++++++++++++- modules/web_search.py | 29 ++++------------------------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 8bac680c..495fe934 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -538,6 +538,27 @@ def extract_pdf_text(pdf_path): return f"[Error extracting PDF text: {str(e)}]" +def generate_search_query(user_message, state): + """Generate a search query from user message using the LLM""" + # Augment the user message with search instruction + augmented_message = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else." 
+ + # Use a minimal state for search query generation but keep the full history + search_state = state.copy() + search_state['max_new_tokens'] = 64 + search_state['auto_max_new_tokens'] = False + search_state['enable_thinking'] = False + + # Generate the full prompt using existing history + augmented message + formatted_prompt = generate_chat_prompt(augmented_message, search_state) + + query = "" + for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True): + query = reply.strip() + + return query + + def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False): # Handle dict format with text and files files = [] @@ -570,7 +591,9 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess add_message_attachment(output, row_idx, file_path, is_user=True) # Add web search results as attachments if enabled - add_web_search_attachments(output, row_idx, text, state) + if state.get('enable_web_search', False): + search_query = generate_search_query(text, state) + add_web_search_attachments(output, row_idx, text, search_query, state) # Apply extensions text, visible_text = apply_extensions('chat_input', text, visible_text, state) diff --git a/modules/web_search.py b/modules/web_search.py index d3387ac9..667178c5 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -13,22 +13,6 @@ def get_current_timestamp(): return datetime.now().strftime('%b %d, %Y %H:%M') -def generate_search_query(user_message, state): - """Generate a search query from user message using the LLM""" - search_prompt = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else." 
- - # Use a minimal state for search query generation - search_state = state.copy() - search_state['max_new_tokens'] = 64 - search_state['temperature'] = 0.1 - - query = "" - for reply in generate_reply(search_prompt, search_state, stopping_strings=[], is_chat=False): - query = reply.strip() - - return query - - def download_web_page(url, timeout=10): """Download and extract text from a web page""" try: @@ -82,19 +66,14 @@ def perform_web_search(query, num_pages=3): return [] -def add_web_search_attachments(history, row_idx, user_message, state): +def add_web_search_attachments(history, row_idx, user_message, search_query, state): """Perform web search and add results as attachments""" - if not state.get('enable_web_search', False): + if not search_query: + logger.warning("No search query provided") return try: - # Generate search query - search_query = generate_search_query(user_message, state) - if not search_query: - logger.warning("Failed to generate search query") - return - - logger.info(f"Generated search query: {search_query}") + logger.info(f"Using search query: {search_query}") # Perform web search num_pages = int(state.get('web_search_pages', 3)) From 7080a02252b9949297950ef3669361d21f4a6bcf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 18:15:21 -0700 Subject: [PATCH 137/164] Reduce the timeout for downloading web pages --- modules/web_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/web_search.py b/modules/web_search.py index 667178c5..070f850c 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -13,7 +13,7 @@ def get_current_timestamp(): return datetime.now().strftime('%b %d, %Y %H:%M') -def download_web_page(url, timeout=10): +def download_web_page(url, timeout=5): """Download and extract text from a web page""" try: headers = { From 75d6cfd14d1aed5ba19bd747479794cbd34212d0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 28 May 2025 20:34:14 -0700 Subject: [PATCH 138/164] Download fetched web search results in parallel --- modules/web_search.py | 44 +++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/modules/web_search.py b/modules/web_search.py index 070f850c..1f670349 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -1,3 +1,5 @@ +import concurrent.futures +from concurrent.futures import as_completed from datetime import datetime import requests @@ -5,7 +7,6 @@ from bs4 import BeautifulSoup from duckduckgo_search import DDGS from modules.logging_colors import logger -from modules.text_generation import generate_reply def get_current_timestamp(): @@ -40,27 +41,50 @@ def download_web_page(url, timeout=5): return f"[Error downloading content from {url}: {str(e)}]" -def perform_web_search(query, num_pages=3): +def perform_web_search(query, num_pages=3, max_workers=5): """Perform web search and return results with content""" try: with DDGS() as ddgs: results = list(ddgs.text(query, max_results=num_pages)) - search_results = [] + # Prepare download tasks + download_tasks = [] for i, result in enumerate(results): url = result.get('href', '') title = result.get('title', f'Search Result {i+1}') + download_tasks.append((url, title, i)) - # Download page content - content = download_web_page(url) + search_results = [None] * len(download_tasks) # Pre-allocate to maintain order - search_results.append({ - 'title': title, - 'url': url, - 'content': content 
- }) + # Download pages in parallel + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all download tasks + future_to_task = { + executor.submit(download_web_page, task[0]): task + for task in download_tasks + } + + # Collect results as they complete + for future in as_completed(future_to_task): + url, title, index = future_to_task[future] + try: + content = future.result() + search_results[index] = { + 'title': title, + 'url': url, + 'content': content + } + except Exception as e: + logger.error(f"Error downloading {url}: {e}") + # Include failed downloads with empty content + search_results[index] = { + 'title': title, + 'url': url, + 'content': '' + } return search_results + except Exception as e: logger.error(f"Error performing web search: {e}") return [] From 63234b9b6f60ec4f276480b4e7f9d4cd1395dcaf Mon Sep 17 00:00:00 2001 From: Underscore <47636331+Th-Underscore@users.noreply.github.com> Date: Thu, 29 May 2025 07:22:03 -0400 Subject: [PATCH 139/164] UI: Fix impersonate (#7025) --- modules/chat.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 495fe934..7afd906d 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -691,16 +691,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess yield output -def impersonate_wrapper(text, state): +def impersonate_wrapper(textbox, state): + text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) prompt = generate_chat_prompt('', state, impersonate=True) stopping_strings = get_stopping_strings(state) - yield text + '...', static_output + textbox['text'] = text + '...' 
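
# Note (illustrative, not a diff line): the chat textbox value is a dict with a 'text'
# field (and 'files', per the {"text": "", "files": []} resets used by the other chat
# handlers in this series), so impersonate now reads and yields textbox['text'] instead
# of yielding a bare string.
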
+ yield textbox, static_output reply = None for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True): - yield (text + reply).lstrip(' '), static_output + textbox['text'] = (text + reply).lstrip(' ') + yield textbox, static_output if shared.stop_everything: return From a8d02dec8f5e6a054a153b3b09425b51e090ae11 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:24:21 -0700 Subject: [PATCH 140/164] Bump llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 0eaf10da..5f61aff9 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 65f184bf..a718b6ca 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; 
platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index d20b2ec3..5fddc623 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 2613d787..8e014445 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index af583b00..77779f3d 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,8 +32,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 9bf2a37d..79efc607 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -32,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 1731448e..8b29453e 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -32,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index fc481a1a..f1f4a02e 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index fdae681d..adf50d9a 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index a58f39f7..46b36791 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 91ea3a6d..66052711 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -18,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt 
b/requirements/portable/requirements_cpu_only.txt index 37e5aa40..4013abcc 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index dcb2884b..41808854 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 8f1295bb..cff79ec6 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 858b4488..762b3fa3 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 569bae99..b425d305 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -18,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 685cfe254036111711de027f6d3a8198d02e7545 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:26:43 -0700 Subject: [PATCH 141/164] Lint --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 181a19b8..8af87b42 100644 --- a/css/main.css +++ b/css/main.css @@ -265,7 +265,7 @@ button { .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: rgba(255, 255, 255, 0.2); + background: rgb(255 255 255 / 20%); border-radius: 10px; } From f2ee917d4f600ebbc5fa9d5fcf65cf5feef27fc1 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:55:05 -0700 Subject: [PATCH 142/164] Update README --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7105ce23..afb21cb0 100644 --- a/README.md +++ b/README.md @@ -14,14 +14,17 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. 
+- **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. +- **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. +- **Advanced chat management**: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. +- **Automatic prompt formatting** using Jinja2 templates. You don't need to ever worry about prompt formats. - UI that resembles the original ChatGPT style. -- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. - Multiple sampling parameters and generation options for sophisticated text generation control. - Switch between different models easily in the UI without restarting, with fine control over settings. - OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). -- 100% offline and private, with zero telemetry, external resources, or remote update requests. +- 100% offline and private, with zero telemetry, external resources, or remote update requests. Web search is optional and user-controlled. - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. ## How to install From 2a9699033d90f4ffedfb22cbba7003c6441d08dc Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 04:55:59 -0700 Subject: [PATCH 143/164] Update README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index afb21cb0..05809436 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,8 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. -- **Advanced chat management**: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. -- **Automatic prompt formatting** using Jinja2 templates. You don't need to ever worry about prompt formats. +- Advanced chat management: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. +- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. - UI that resembles the original ChatGPT style. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. 
- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. From 9a94d7b4f6ae95b6b4b2fc521b5b25c300915dc9 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 05:02:52 -0700 Subject: [PATCH 144/164] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 05809436..900d5fbd 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). -- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. +- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory without affecting your system. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. - Advanced chat management: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. From 0986d075fb22dc5aa582bbefdfdb0ebdb6ee92c8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 05:03:59 -0700 Subject: [PATCH 145/164] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 900d5fbd..ec01c0aa 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory without affecting your system. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. -- Advanced chat management: Edit messages, navigate between message versions (like "swipes"), and branch conversations at any point. +- Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. - UI that resembles the original ChatGPT style. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. 
From 36bc2760058ed4e6998f4c55176c7311b0facabe Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 05:39:26 -0700 Subject: [PATCH 146/164] Update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ec01c0aa..9accffb7 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. - Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. +- Automatic GPU layers for GGUF models (on NVIDIA GPUs). - UI that resembles the original ChatGPT style. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. From 81794692ab6fbc0ef24c7484b6571de090984dde Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 08:07:14 -0700 Subject: [PATCH 147/164] UI: Make the dark theme darker --- css/main.css | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/css/main.css b/css/main.css index 8af87b42..0d0a13cf 100644 --- a/css/main.css +++ b/css/main.css @@ -1,11 +1,11 @@ :root { --darker-gray: #202123; - --dark-gray: #343541; - --light-gray: #444654; + --dark-gray: #2A2B32; + --light-gray: #373943; --light-theme-gray: #f9fbff; --border-color-dark: #525252; --header-width: 112px; - --selected-item-color-dark: #32333e; + --selected-item-color-dark: #2E2F38; } @font-face { From c970c5f1665c3966c84ba50a05a45d2598038ea6 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 08:15:13 -0700 Subject: [PATCH 148/164] Make scrollbars darker in dark theme --- css/main.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/css/main.css b/css/main.css index 0d0a13cf..7f9d4618 100644 --- a/css/main.css +++ b/css/main.css @@ -265,7 +265,7 @@ button { .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: rgb(255 255 255 / 20%); + background: rgb(255 255 255 / 10%); border-radius: 10px; } From 3f37a2e915a31b273caddd12a80412a199d753a7 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 08:49:31 -0700 Subject: [PATCH 149/164] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9accffb7..361584f8 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. 
## Features - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). -- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory without affecting your system. +- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory. - **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. - **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. - Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. From faa5c82c64e2036762ed3ff60a38fc5b37dac36d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 09:02:34 -0700 Subject: [PATCH 150/164] Fix message version count not updating during regeneration streaming --- modules/chat.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index 7afd906d..90d66687 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -617,10 +617,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if regenerate: row_idx = len(output['internal']) - 1 - # Store the first response as a version before regenerating + # Store the old response as a version before regenerating if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'): add_message_version(output, "assistant", row_idx, is_current=False) + # Add new empty version (will be filled during streaming) + key = f"assistant_{row_idx}" + output['metadata'][key]["versions"].append({ + "content": "", + "visible_content": "", + "timestamp": get_current_timestamp() + }) + output['metadata'][key]["current_version_index"] = len(output['metadata'][key]["versions"]) - 1 + if loading_message: yield { 'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]], @@ -673,20 +682,34 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if _continue: output['internal'][-1] = [text, last_reply[0] + reply] output['visible'][-1] = [visible_text, last_reply[1] + visible_reply] - if is_stream: - yield output elif not (j == 0 and visible_reply.strip() == ''): output['internal'][-1] = [text, reply.lstrip(' ')] output['visible'][-1] = [visible_text, visible_reply.lstrip(' ')] - if is_stream: - yield output + + # Keep version metadata in sync during streaming (for regeneration) + if regenerate: + row_idx = len(output['internal']) - 1 + key = f"assistant_{row_idx}" + current_idx = output['metadata'][key]['current_version_index'] + output['metadata'][key]['versions'][current_idx].update({ + 'content': output['internal'][row_idx][1], + 'visible_content': output['visible'][row_idx][1] + }) + + if is_stream: + yield 
output output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) - # Add the newly generated response as a version (only for regeneration) + # Final sync for version metadata (in case streaming was disabled) if regenerate: row_idx = len(output['internal']) - 1 - add_message_version(output, "assistant", row_idx, is_current=True) + key = f"assistant_{row_idx}" + current_idx = output['metadata'][key]['current_version_index'] + output['metadata'][key]['versions'][current_idx].update({ + 'content': output['internal'][row_idx][1], + 'visible_content': output['visible'][row_idx][1] + }) yield output From 724147ffabce95b5d20528b83b6e44c1523d58f0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 10:49:29 -0700 Subject: [PATCH 151/164] Better detect when no model is available --- modules/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/utils.py b/modules/utils.py index 0e8bdd18..577c55b8 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -74,7 +74,7 @@ def natural_keys(text): def check_model_loaded(): if shared.model_name == 'None' or shared.model is None: - if len(get_available_models()) <= 1: + if len(get_available_models()) == 0: error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it" logger.error(error_msg) return False, error_msg From e7129f9dbefbe87fa4c425b5873f80cbddaf7cf0 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 12:45:53 -0700 Subject: [PATCH 152/164] Prevent footer buttons below last assistant message from always appearing --- js/main.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/js/main.js b/js/main.js index 48bb8632..ea3ff46a 100644 --- a/js/main.js +++ b/js/main.js @@ -171,7 +171,6 @@ const observer = new MutationObserver(function(mutations) { document.getElementById("Generate").style.display = "flex"; } - doSyntaxHighlighting(); if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) { @@ -184,7 +183,7 @@ const observer = new MutationObserver(function(mutations) { const lastChild = messagesContainer?.lastElementChild; const prevSibling = lastChild?.previousElementSibling; if (lastChild && prevSibling) { - lastChild.style.minHeight = `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px))`; + lastChild.style.setProperty("margin-bottom", `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px)`, "important"); } } }); From aff41f3482bc7045334b0d81ac514723fdbd4f97 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 12:53:41 -0700 Subject: [PATCH 153/164] Update README --- README.md | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 361584f8..daf409d0 100644 --- a/README.md +++ b/README.md @@ -189,13 +189,13 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [-- [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] 
[--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] - [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] + [--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] - [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] - [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] - [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] - [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api] - [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] + [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] + [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] + [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] + [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] + [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] Text generation web UI @@ -217,7 +217,7 @@ Basic settings: --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. Model loader: - --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, + --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM. Transformers/Accelerate: @@ -248,16 +248,18 @@ llama.cpp: --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. --no-mmap Prevent mmap from being used. --mlock Force the system to keep the model in RAM. - --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. + --gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU. --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. --numa Activate NUMA task allocation for llama.cpp. --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. --row-split Split the model by rows across GPUs. 
This may improve multi-gpu performance. - --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU" + --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU" --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. -Context and cache management: +Context and cache: --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. + --cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits + separately, e.g. q4_q8). Speculative decoding: --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. @@ -276,15 +278,9 @@ ExLlamaV2: --num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral. --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. -HQQ: - --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. - TensorRT-LLM: --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. -Cache: - --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. - DeepSpeed: --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. @@ -307,6 +303,7 @@ Gradio: --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy --old-colors Use the legacy Gradio colors, before the December/2024 update. + --portable Hide features not available in portable mode like training. API: --api Enable the API extension. 
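The API flags above expose the OpenAI-compatible Chat and Completions endpoints listed under Features. As a minimal client-side sketch — assuming the server was launched with `--api`, is reachable on the default API port 5000 on localhost, and already has a model loaded — a chat completion request can be made as shown below; the URL, prompt, and sampling values are illustrative and not part of the patches above:

```python
# Minimal sketch of a call to the OpenAI-compatible Chat Completions endpoint.
# Assumes: server started with --api, default API port 5000, a model already loaded.
import requests

url = "http://127.0.0.1:5000/v1/chat/completions"  # adjust host/port to match your --api-port setting
payload = {
    "messages": [
        {"role": "user", "content": "Summarize speculative decoding in one sentence."}
    ],
    "max_tokens": 200,
    "temperature": 0.7,
}

response = requests.post(url, json=payload, timeout=120)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```

If `--api-key` is set, the key would typically be sent as a Bearer token in the `Authorization` header, following the OpenAI client convention that these endpoints mirror.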
From f59998d2680f346038320b536617c4738c393947 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 13:08:48 -0700 Subject: [PATCH 154/164] Don't limit the number of prompt characters printed with --verbose --- modules/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 962311df..1fd6d810 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -505,11 +505,11 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N return -def print_prompt(prompt, max_chars=2000): +def print_prompt(prompt, max_chars=-1): DARK_YELLOW = "\033[38;5;3m" RESET = "\033[0m" - if len(prompt) > max_chars: + if max_chars > 0 and len(prompt) > max_chars: half_chars = max_chars // 2 hidden_len = len(prompt[half_chars:-half_chars]) hidden_msg = f"{DARK_YELLOW}[...{hidden_len} characters hidden...]{RESET}" From a45a65213052dad02d696ed54af1b9f2ea82cd4a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 13:28:51 -0700 Subject: [PATCH 155/164] CSS fix --- js/main.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/js/main.js b/js/main.js index ea3ff46a..f23dc246 100644 --- a/js/main.js +++ b/js/main.js @@ -183,7 +183,10 @@ const observer = new MutationObserver(function(mutations) { const lastChild = messagesContainer?.lastElementChild; const prevSibling = lastChild?.previousElementSibling; if (lastChild && prevSibling) { - lastChild.style.setProperty("margin-bottom", `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px)`, "important"); + lastChild.style.setProperty("margin-bottom", + `max(0px, calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px))`, + "important" + ); } } }); From 8078c41ec67b96656d7e96128d915290b319e4f5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 29 May 2025 13:32:19 -0700 Subject: [PATCH 156/164] Revert "Bump llama.cpp" This reverts commit a8d02dec8f5e6a054a153b3b09425b51e090ae11. 
--- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 4 ++-- requirements/full/requirements_amd_noavx2.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 16 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 5f61aff9..0eaf10da 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -33,8 +33,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index a718b6ca..65f184bf 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; 
platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 5fddc623..d20b2ec3 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 8e014445..2613d787 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -32,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 77779f3d..af583b00 100644 --- 
a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -32,8 +32,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
 https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 79efc607..9bf2a37d 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -32,5 +32,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 8b29453e..1731448e 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -32,5 +32,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index f1f4a02e..fc481a1a 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -33,8 +33,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index adf50d9a..fdae681d 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index 46b36791..a58f39f7 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 66052711..91ea3a6d 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -18,6 +18,6 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 4013abcc..37e5aa40 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index 41808854..dcb2884b 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index cff79ec6..8f1295bb 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 762b3fa3..858b4488 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index b425d305..569bae99 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -18,5 +18,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.15.0/llama_cpp_binaries-0.15.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

From dce02732a4caef16157ffbc288dfe079053e0bb4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 29 May 2025 14:08:48 -0700
Subject: [PATCH 157/164] Fix timestamp issues when editing/swiping messages

---
 modules/chat.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/modules/chat.py b/modules/chat.py
index 90d66687..6b3ff4fc 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1508,11 +1508,12 @@ def handle_edit_message_click(state):
     if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]:
         original_content = history['internal'][message_index][role_idx]
         original_visible = history['visible'][message_index][role_idx]
+        original_timestamp = history['metadata'][key].get('timestamp', get_current_timestamp())
 
         history['metadata'][key]["versions"] = [{
             "content": original_content,
             "visible_content": original_visible,
-            "timestamp": get_current_timestamp()
+            "timestamp": original_timestamp
         }]
 
     history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True)
@@ -1564,6 +1565,7 @@ def handle_navigate_version_click(state):
     history['internal'][message_index][msg_content_idx] = version_to_load['content']
     history['visible'][message_index][msg_content_idx] = version_to_load['visible_content']
     metadata['current_version_index'] = new_idx
+    update_message_metadata(history['metadata'], role, message_index, timestamp=version_to_load['timestamp'])
 
     # Redraw and save
     html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

From acbcc12e7b19cc9f540d32b8d601ceefde77b7a1 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 29 May 2025 14:11:21 -0700
Subject: [PATCH 158/164] Clean up

---
 modules/chat.py    | 7 ++-----
 modules/ui_chat.py | 2 +-
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index 6b3ff4fc..e526a9a0 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -1493,7 +1493,7 @@ def handle_edit_message_click(state):
 
     if message_index >= len(history['internal']):
         html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
-        return [history, html_output, gr.update()]  # No unique_id change
+        return [history, html_output]
 
     role_idx = 0 if role == "user" else 1
 
@@ -1521,13 +1521,10 @@ def handle_edit_message_click(state):
 
         add_message_version(history, role, message_index, is_current=True)
 
-    # Since we are not branching, unique_id does not change.
-    past_chats_update = gr.update()
-
     save_history(history, state['unique_id'], state['character_menu'], state['mode'])
     html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
-    return [history, html_output, past_chats_update]
+    return [history, html_output]
 
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index df3d3929..d79aa523 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -314,7 +314,7 @@ def create_event_handlers():
 
     shared.gradio['edit_message'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False)
+        chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False)
 
     # Save/delete a character
     shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False)

From d1bfb08e8d4bab174e6b4467eff20f8a01a2a613 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 29 May 2025 14:27:47 -0700
Subject: [PATCH 159/164] Improve the style of message editing

---
 css/main.css | 1 +
 1 file changed, 1 insertion(+)

diff --git a/css/main.css b/css/main.css
index 7f9d4618..9685c863 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1462,6 +1462,7 @@ strong {
 .editing-textarea {
     width: 100%;
     min-height: 200px;
+    max-height: 65vh;
     padding: 10px;
     border-radius: 5px;
     border: 1px solid #ccc;

From 28e6bd4fcd8cd385cc92cc56c0c49fc474006147 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 29 May 2025 14:49:07 -0700
Subject: [PATCH 160/164] Revert "Update transformers requirement in /requirements/full (#7017)"

This reverts commit cc9b7253c1216e5340da85cba9b65a13cf3526e9.
---
 requirements/full/requirements.txt | 2 +-
 requirements/full/requirements_amd.txt | 2 +-
 requirements/full/requirements_amd_noavx2.txt | 2 +-
 requirements/full/requirements_apple_intel.txt | 2 +-
 requirements/full/requirements_apple_silicon.txt | 2 +-
 requirements/full/requirements_cpu_only.txt | 2 +-
 requirements/full/requirements_cpu_only_noavx2.txt | 2 +-
 requirements/full/requirements_noavx2.txt | 2 +-
 requirements/full/requirements_nowheels.txt | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 0eaf10da..2c322715 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -23,7 +23,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 65f184bf..6aeb325e 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index d20b2ec3..3b052423 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 2613d787..8c51459e 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index af583b00..b9f15d45 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 9bf2a37d..0877d968 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 1731448e..cab78237 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index fc481a1a..dfd42577 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -23,7 +23,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 2ed8affa..5d9f84ce 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -22,7 +22,7 @@ safetensors==0.5.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.52.*
+transformers==4.50.*
 tqdm
 wandb
 
From 7c29879e795776ceb742a8ddb47fd3843069cf34 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 30 May 2025 11:17:47 -0700
Subject: [PATCH 161/164] Fix 'Start reply with' (closes #7033)

---
 modules/chat.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/modules/chat.py b/modules/chat.py
index e526a9a0..881f7330 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -806,9 +806,12 @@ def remove_last_message(history):
     return html.unescape(last[0]), history
 
 
-def send_dummy_message(textbox, state):
+def send_dummy_message(text, state):
     history = state['history']
-    text = textbox['text']
+
+    # Handle both dict and string inputs
+    if isinstance(text, dict):
+        text = text['text']
 
     # Initialize metadata if not present
     if 'metadata' not in history:
@@ -822,9 +825,12 @@ def send_dummy_message(textbox, state):
     return history
 
 
-def send_dummy_reply(textbox, state):
+def send_dummy_reply(text, state):
     history = state['history']
-    text = textbox['text']
+
+    # Handle both dict and string inputs
+    if isinstance(text, dict):
+        text = text['text']
 
     # Initialize metadata if not present
     if 'metadata' not in history:

From 298d4719c6c9545a701a9cc9e8f4efceb108599a Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 30 May 2025 11:32:24 -0700
Subject: [PATCH 162/164] Multiple small style improvements

---
 css/main.css  | 4 ++++
 modules/ui.py | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/css/main.css b/css/main.css
index 9685c863..967d94ed 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1551,3 +1551,7 @@ strong {
     color: var(--body-text-color-subdued);
     margin-top: 4px;
 }
+
+button:focus {
+    outline: none;
+}
diff --git a/modules/ui.py b/modules/ui.py
index a2662e14..9f4d67cb 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -71,6 +71,7 @@ if not shared.args.old_colors:
         block_background_fill_dark='transparent',
         block_border_color_dark='transparent',
         input_border_color_dark='var(--border-color-dark)',
+        input_border_color_focus_dark='var(--border-color-dark)',
         checkbox_border_color_dark='var(--border-color-dark)',
         border_color_primary_dark='var(--border-color-dark)',
         button_secondary_border_color_dark='var(--border-color-dark)',
@@ -89,6 +90,8 @@ if not shared.args.old_colors:
         checkbox_label_shadow='none',
         block_shadow='none',
         block_shadow_dark='none',
+        input_shadow_focus='none',
+        input_shadow_focus_dark='none',
         button_large_radius='0.375rem',
         button_large_padding='6px 12px',
         input_radius='0.375rem',

From 219f0a773166deeb0326c2874b29e66e382df524 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 30 May 2025 12:05:49 -0700
Subject: [PATCH 163/164] Fix exllamav3_hf models failing to unload (closes #7031)

---
 modules/exllamav3_hf.py | 17 +++++++++++++++++
 modules/models.py       | 3 +++
 2 files changed, 20 insertions(+)

diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py
index 417df473..1254ff5d 100644
--- a/modules/exllamav3_hf.py
+++ b/modules/exllamav3_hf.py
@@ -245,3 +245,20 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
             pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
 
         return Exllamav3HF(pretrained_model_name_or_path)
+
+    def unload(self):
+        """Properly unload the ExllamaV3 model and free GPU memory."""
+        if hasattr(self, 'ex_model') and self.ex_model is not None:
+            self.ex_model.unload()
+            self.ex_model = None
+
+        if hasattr(self, 'ex_cache') and self.ex_cache is not None:
+            self.ex_cache = None
+
+        # Clean up any additional ExllamaV3 resources
+        if hasattr(self, 'past_seq'):
+            self.past_seq = None
+        if hasattr(self, 'past_seq_negative'):
+            self.past_seq_negative = None
+        if hasattr(self, 'ex_cache_negative'):
+            self.ex_cache_negative = None
diff --git a/modules/models.py b/modules/models.py
index 4218d58c..d329ae3c 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -116,10 +116,13 @@ def unload_model(keep_model_name=False):
         return
 
     is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
+    if shared.args.loader == 'ExLlamav3_HF':
+        shared.model.unload()
 
     shared.model = shared.tokenizer = None
    shared.lora_names = []
    shared.model_dirty_from_training = False
+
    if not is_llamacpp:
        from modules.torch_utils import clear_torch_cache
        clear_torch_cache()

From 15f466ca3f8255f2566f016db8d7b8fd9ebef3f4 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Fri, 30 May 2025 15:49:57 -0700
Subject: [PATCH 164/164] Update README

---
 README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index daf409d0..55df33d2 100644
--- a/README.md
+++ b/README.md
@@ -14,18 +14,18 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 
 - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)).
 - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
-- **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents.
-- **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation.
-- Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point.
-- Automatic GPU layers for GGUF models (on NVIDIA GPUs).
-- UI that resembles the original ChatGPT style.
-- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`.
-- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these.
+- 100% offline and private, with zero telemetry, external resources, or remote update requests.
 - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats.
+- **File attachments**: Upload text files and PDF documents to talk about their contents.
+- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
+- Aesthetic UI with dark and light themes.
+- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters.
+- Edit messages, navigate between message versions, and branch conversations at any point.
 - Multiple sampling parameters and generation options for sophisticated text generation control.
-- Switch between different models easily in the UI without restarting, with fine control over settings.
+- Switch between different models in the UI without restarting.
+- Automatic GPU layers for GGUF models (on NVIDIA GPUs).
+- Free-form text generation in the Default/Notebook tabs without being limited to chat turns.
 - OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
-- 100% offline and private, with zero telemetry, external resources, or remote update requests. Web search is optional and user-controlled.
 - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
 
 ## How to install