diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml index fb9e61b0..571cbac0 100644 --- a/.github/workflows/build-portable-release-cuda.yml +++ b/.github/workflows/build-portable-release-cuda.yml @@ -102,6 +102,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables CUDA_VERSION="${{ matrix.cuda }}" diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml index 8de29791..4e88d4d9 100644 --- a/.github/workflows/build-portable-release-vulkan.yml +++ b/.github/workflows/build-portable-release-vulkan.yml @@ -101,6 +101,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables AVX_SUPPORT="${{ matrix.avx }}" diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml index bdf96cec..6910ce2c 100644 --- a/.github/workflows/build-portable-release.yml +++ b/.github/workflows/build-portable-release.yml @@ -101,6 +101,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables AVX_SUPPORT="${{ matrix.avx }}" diff --git a/README.md b/README.md index 4b541b9e..55df33d2 100644 --- a/README.md +++ b/README.md @@ -12,18 +12,20 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features -- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). - - [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is also supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile). - - Additional quantization libraries like [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) can be used with the Transformers loader if you install them manually. -- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for llama.cpp GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. -- UI that resembles the original ChatGPT style. -- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. -- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. -- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. 
You can send formatted conversations from the Chat tab to these. -- Multiple sampling parameters and generation options for sophisticated text generation control. -- Switch between different models easily in the UI without restarting, with fine control over settings. -- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). +- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). +- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory. - 100% offline and private, with zero telemetry, external resources, or remote update requests. +- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. +- **File attachments**: Upload text files and PDF documents to talk about their contents. +- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation. +- Aesthetic UI with dark and light themes. +- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters. +- Edit messages, navigate between message versions, and branch conversations at any point. +- Multiple sampling parameters and generation options for sophisticated text generation control. +- Switch between different models in the UI without restarting. +- Automatic GPU layers for GGUF models (on NVIDIA GPUs). +- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. +- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. ## How to install @@ -44,7 +46,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again. -You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`. +You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, or `update_wizard_macos.sh`.
@@ -55,12 +57,12 @@ Setup details and information about installing manually The script uses Miniconda to set up a Conda environment in the `installer_files` folder. -If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, `cmd_macos.sh`, or `cmd_wsl.bat`. +If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`. * There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root. * To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts. * For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki). -* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. +* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. ### Manual installation using Conda @@ -90,7 +92,7 @@ conda activate textgen |--------|---------|---------| | Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | | Linux/WSL | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu` | -| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.1` | +| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` | | MacOS + MPS | Any | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` | | Windows | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | | Windows | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` | @@ -146,14 +148,14 @@ The `requirements*.txt` above contain various wheels precompiled through GitHub For NVIDIA GPU: ln -s docker/{nvidia/Dockerfile,nvidia/docker-compose.yml,.dockerignore} . For AMD GPU: -ln -s docker/{amd/Dockerfile,intel/docker-compose.yml,.dockerignore} . +ln -s docker/{amd/Dockerfile,amd/docker-compose.yml,.dockerignore} . For Intel GPU: ln -s docker/{intel/Dockerfile,amd/docker-compose.yml,.dockerignore} . For CPU only ln -s docker/{cpu/Dockerfile,cpu/docker-compose.yml,.dockerignore} . 
cp docker/.env.example .env #Create logs/cache dir : -mkdir -p logs cache +mkdir -p user_data/logs user_data/cache # Edit .env and set: # TORCH_CUDA_ARCH_LIST based on your GPU model # APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal) @@ -187,13 +189,13 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [-- [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] - [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] + [--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] - [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] - [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] - [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] - [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api] - [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] + [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] + [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] + [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] + [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] + [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] Text generation web UI @@ -215,7 +217,7 @@ Basic settings: --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. Model loader: - --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, + --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. 
Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM. Transformers/Accelerate: @@ -246,16 +248,18 @@ llama.cpp: --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. --no-mmap Prevent mmap from being used. --mlock Force the system to keep the model in RAM. - --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. + --gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU. --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. --numa Activate NUMA task allocation for llama.cpp. --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. - --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU" + --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU" --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. -Context and cache management: +Context and cache: --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. + --cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits + separately, e.g. q4_q8). Speculative decoding: --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. @@ -274,15 +278,9 @@ ExLlamaV2: --num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral. --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. -HQQ: - --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. - TensorRT-LLM: --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. -Cache: - --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. - DeepSpeed: --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. @@ -305,6 +303,7 @@ Gradio: --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy --old-colors Use the legacy Gradio colors, before the December/2024 update. + --portable Hide features not available in portable mode like training. API: --api Enable the API extension. 
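The README and flag listing above advertise the new tool-calling support on the OpenAI-compatible API, and the `docs/12 - OpenAI API.md` hunk further down in this diff adds a `curl` request/response example for it. As a rough companion sketch (not taken from the patch itself), the same request can be issued from Python with `requests`, assuming the server was started with `--api` on the default port 5000 and a tool-capable model is loaded; the model-independent endpoint and response shape below follow that documentation, while the tool definition and prompt are placeholders.

```python
import json

import requests

# Assumed default API address; adjust if --api-port or --listen were changed.
URL = "http://127.0.0.1:5000/v1/chat/completions"

payload = {
    "messages": [{"role": "user", "content": "What time is it currently in New York City?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_current_time",
            "description": "Get current time in a specific timezone",
            "parameters": {
                "type": "object",
                "required": ["timezone"],
                "properties": {"timezone": {"type": "string", "description": "IANA timezone name"}},
            },
        },
    }],
}

# Non-streaming request; the prompt is formatted server-side from the model's Jinja2 template.
choice = requests.post(URL, json=payload, timeout=120).json()["choices"][0]

if choice["finish_reason"] == "tool_calls":
    # In the documented sample response, "tool_calls" sits alongside "message"
    # and each call's "arguments" field is a JSON-encoded string.
    for call in choice["tool_calls"]:
        print(call["function"]["name"], json.loads(call["function"]["arguments"]))
else:
    print(choice["message"]["content"])
```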
diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css index 368a2a16..6a4784cc 100644 --- a/css/chat_style-Dark.css +++ b/css/chat_style-Dark.css @@ -1,7 +1,9 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 28px; + padding-bottom: 22px; + padding-top: 6px; font-size: 18px; font-family: Roboto, Arial, sans-serif; /* Modern font */ line-height: 1.5; @@ -102,6 +104,7 @@ @media screen and (width <= 688px) { .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 25px; font-size: 15px; diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css index 6404f41d..fbd47072 100644 --- a/css/chat_style-TheEncrypted777.css +++ b/css/chat_style-TheEncrypted777.css @@ -2,8 +2,10 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 28px; + padding-bottom: 21px; + padding-top: 7px; font-size: 18px; font-family: 'Noto Sans', Arial, sans-serif; line-height: 1.428571429; @@ -100,6 +102,7 @@ @media screen and (width <= 688px) { .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 25px; font-size: 15px; diff --git a/css/chat_style-cai-chat-square.css b/css/chat_style-cai-chat-square.css index 854fff60..291a1209 100644 --- a/css/chat_style-cai-chat-square.css +++ b/css/chat_style-cai-chat-square.css @@ -16,6 +16,7 @@ } .message { - padding-bottom: 2em; + padding-bottom: 1.5em; + padding-top: 0.5em; grid-template-columns: 70px minmax(0, 1fr); } diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index 93276bd3..b06b1269 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -1,7 +1,9 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 2em; + padding-bottom: 1.5em; + padding-top: 0.5em; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 22.5px !important; diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css index f0fd1578..65af5f7a 100644 --- a/css/chat_style-messenger.css +++ b/css/chat_style-messenger.css @@ -1,5 +1,6 @@ .message { - padding-bottom: 25px; + padding-bottom: 22px; + padding-top: 3px; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 1.428571429; diff --git a/css/chat_style-wpp.css b/css/chat_style-wpp.css index 30ca61f3..353201c2 100644 --- a/css/chat_style-wpp.css +++ b/css/chat_style-wpp.css @@ -1,5 +1,6 @@ .message { - padding-bottom: 25px; + padding-bottom: 22px; + padding-top: 3px; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 1.428571429; diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 4613b380..6ad250aa 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -8,10 +8,6 @@ padding-top: 0 !important; } -.chat > .messages > :last-child { - margin-bottom: 1.7rem !important; -} - .chat .message-body p, .chat .message-body li { font-size: 1rem !important; line-height: 28px !important; @@ -46,7 +42,7 @@ } .chat .user-message { - background: #f5f5f5; + background: #f3f4f6; padding: 1.5rem 1rem; padding-bottom: 2rem; border-radius: 0; @@ -61,16 +57,16 @@ } .dark .chat .user-message { - background: transparent; + background: var(--light-gray); } .dark .chat .assistant-message { - background: var(--light-gray); + background: transparent; } .chat .user-message 
.text, .chat .assistant-message .text { - max-width: 645px; + max-width: 700px; margin-left: auto; margin-right: auto; } diff --git a/css/main.css b/css/main.css index d6e5ac83..967d94ed 100644 --- a/css/main.css +++ b/css/main.css @@ -1,11 +1,11 @@ :root { --darker-gray: #202123; - --dark-gray: #343541; - --light-gray: #444654; - --light-theme-gray: #f5f5f5; + --dark-gray: #2A2B32; + --light-gray: #373943; + --light-theme-gray: #f9fbff; --border-color-dark: #525252; --header-width: 112px; - --selected-item-color-dark: #32333e; + --selected-item-color-dark: #2E2F38; } @font-face { @@ -131,7 +131,7 @@ gradio-app > :first-child { } .header_bar { - box-shadow: 0 0 3px rgba(22 22 22 / 35%); + border-right: var(--input-border-width) solid var(--input-border-color); margin-bottom: 0; overflow-x: scroll; text-wrap: nowrap; @@ -265,7 +265,7 @@ button { .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: #ccc; + background: rgb(255 255 255 / 10%); border-radius: 10px; } @@ -389,8 +389,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat { margin-left: auto; margin-right: auto; - min-height: var(--chat-height); - overflow-y: auto; + flex: 1; + overflow-y: hidden; display: flex; flex-direction: column; word-break: break-word; @@ -401,10 +401,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .chat-parent { - height: calc(100dvh - 98px - var(--input-delta)); + flex: 1; overflow: auto !important; border-radius: 0 !important; - margin-bottom: var(--input-delta) !important; } .chat-parent .prose { @@ -420,14 +419,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding-right: 1rem; } +.chat .message .timestamp { + font-size: 0.7em; + display: inline-block; + font-weight: normal; + opacity: 0.7; + margin-left: 5px; +} + .chat-parent.bigchat { - height: calc(100dvh - 98px - var(--input-delta)) !important; - margin-bottom: var(--input-delta) !important; + flex: 1; } .chat > .messages { display: flex; flex-direction: column; + min-height: calc(100vh - 102px); } .chat > .messages > :first-child { @@ -546,7 +553,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border-radius: 5px; font-size: 82%; padding: 1px 3px; - background: white !important; + background: #f3f4f6 !important; color: #1f2328; } @@ -560,18 +567,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 15px; } -.message-body :not(pre) > code::before { - content: "`"; -} - -.message-body :not(pre) > code::after { - content: "`"; -} - .message-body :not(pre) > code { white-space: normal !important; font-weight: bold; - font-family: unset; + font-size: 0.95em; + font-family: Menlo,"Roboto Mono","Courier New",Courier,monospace,Inter,sans-serif; + padding: .15rem .3rem; + background-color: #ececec; +} + +.dark .message-body :not(pre) > code { + background-color: rgb(255 255 255 / 10%); } #chat-input { @@ -582,7 +588,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input textarea { + background: #f3f4f6; padding: 0.65rem 2.5rem; + border: 0; + box-shadow: 0; + border-radius: 8px; } #chat-input textarea::placeholder { @@ -602,9 +612,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: none; } +#chat-input .submit-button { + display: none; +} + +#chat-input .upload-button { + margin-right: 16px; + margin-bottom: 7px; + background: transparent; +} + .chat-input-positioned { - position: absolute; - bottom: 0; max-width: 54rem; left: 50%; transform: translateX(-50%); @@ -744,7 +762,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { 
.hover-menu button { width: 100%; - background: transparent !important; + background: white !important; border-radius: 0 !important; justify-content: space-between; margin: 0 !important; @@ -760,7 +778,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .hover-menu button:hover { - background: var(--button-secondary-background-fill-hover) !important; + background: #dbeafe !important; +} + +.dark .hover-menu button:hover { + background: var(--selected-item-color-dark) !important; } .transparent-substring { @@ -789,6 +811,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input-container { + display: flex; + flex-direction: column; min-width: 0 !important; } @@ -798,9 +822,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input-row { - padding-bottom: 1.5em; - padding-left: 1rem; - padding-right: 1rem; + padding: 1rem; + padding-top: 0; } #chat-input-row.bigchat { @@ -808,27 +831,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-col { - padding-bottom: 100px; + height: 100dvh; + display: flex; + flex-direction: column; + padding-bottom: 0; + gap: 0; } @media screen and (width <= 924px) { #chat-col { - padding-bottom: 100px; margin-top: 32px; - position: relative; /* Ensure positioning for the pseudo-element */ - } - - .chat-parent { - height: calc(100dvh - 98px - var(--input-delta) - 32px); - } - - .chat-parent.bigchat { - height: calc(100dvh - 98px - var(--input-delta) - 32px) !important; + height: calc(100dvh - 32px); } } #chat-col.bigchat { - padding-bottom: 80px !important; + padding-bottom: 15px !important; } .message-body ol, .message-body ul { @@ -985,6 +1003,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { cursor: pointer; } +#past-chats .selected, +#past-chats label:hover { + background-color: #dbeafe !important; +} + #past-chats-buttons, #delete-chat-row, #rename-row { @@ -993,7 +1016,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { gap: 9px; } - #past-chats-row, #chat-controls { width: 260px; @@ -1111,12 +1133,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { color: #9ca3af; } -.dark .hover-menu { - background-color: var(--darker-gray); -} - .dark .hover-menu button { border-color: var(--border-color-primary); + background-color: var(--darker-gray) !important; } .dark #chat-controls, @@ -1125,8 +1144,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border: 0 !important; } -.dark #past-chats .selected, -.dark #past-chats label:hover { +.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats .selected, +.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats label:hover { background-color: var(--selected-item-color-dark) !important; } @@ -1163,7 +1182,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .header_bar button.selected { - background: #E0E0E0; + background: #dbeafe; } #chat-controls, @@ -1171,11 +1190,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { background-color: var(--light-theme-gray); } -#chat-controls { +.dark #chat-controls { border-left: 1px solid #d9d9d0; } -#past-chats-row { +.dark #past-chats-row { border-right: 1px solid #d9d9d0; } @@ -1236,42 +1255,31 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: relative; } -.footer-button { +/* New container for the buttons */ +.message-actions { position: absolute; + bottom: -23px; + left: 0; + display: flex; + gap: 5px; + opacity: 0; + transition: opacity 0.2s; +} + +.footer-button { padding: 0; margin: 0; border: none; border-radius: 3px; cursor: pointer; - opacity: 0; display: flex; 
align-items: center; - transition: opacity 0.2s; + justify-content: center; } -.footer-button.footer-copy-button { - bottom: -23px; - left: 0; -} - -.footer-button.footer-refresh-button { - bottom: -23px; - left: 25px; -} - -.footer-button.footer-continue-button { - bottom: -23px; - left: 50px; -} - -.footer-button.footer-remove-button { - bottom: -23px; - left: 75px; -} - -.message:hover .footer-button, -.user-message:hover .footer-button, -.assistant-message:hover .footer-button { +.message:hover .message-actions, +.user-message:hover .message-actions, +.assistant-message:hover .message-actions { opacity: 1; } @@ -1362,6 +1370,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { contain: layout; } +.chat .message-body .thinking-content p, +.chat .message-body .thinking-content li { + font-size: 15px !important; +} + /* Animation for opening thinking blocks */ @keyframes fadeIn { from { opacity: 0; } @@ -1382,3 +1395,163 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { 50% { opacity: 1; } 100% { opacity: 0.6; } } + +strong { + font-weight: bold; +} + +.min.svelte-1ybaih5 { + min-height: 0; +} + +#vram-info .value { + color: #008d00; +} + +.dark #vram-info .value { + color: #07ff07; +} + +.message-attachments { + display: flex; + flex-wrap: wrap; + gap: 8px; + margin-top: 8px; + padding-bottom: 6px; +} + +.attachment-box { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + padding: 8px; + background: rgb(0 0 0 / 5%); + border-radius: 6px; + border: 1px solid rgb(0 0 0 / 10%); + min-width: 80px; + max-width: 120px; +} + +.attachment-icon { + margin-bottom: 4px; + color: #555; +} + +.attachment-name { + font-size: 0.8em; + text-align: center; + word-break: break-word; + overflow: hidden; + text-overflow: ellipsis; + display: -webkit-box; + -webkit-line-clamp: 2; + -webkit-box-orient: vertical; +} + +.dark .attachment-box { + background: rgb(255 255 255 / 5%); + border: 1px solid rgb(255 255 255 / 10%); +} + +.dark .attachment-icon { + color: #ccc; +} + +/* Message Editing Styles */ +.editing-textarea { + width: 100%; + min-height: 200px; + max-height: 65vh; + padding: 10px; + border-radius: 5px; + border: 1px solid #ccc; + background-color: var(--light-theme-gray); + font-family: inherit; + font-size: inherit; + resize: vertical; +} + +.dark .editing-textarea { + border: 1px solid var(--border-color-dark); + background-color: var(--darker-gray); +} + +.editing-textarea:focus { + outline: none; + border-color: var(--selected-item-color-dark); +} + +.edit-controls-container { + margin-top: 0; + display: flex; + gap: 8px; + padding-bottom: 8px; +} + +.edit-control-button { + padding: 6px 12px; + border: 1px solid #ccc; + border-radius: 4px; + cursor: pointer; + background-color: #f8f9fa; + color: #212529; + font-size: 12px; + margin: 0; +} + +.dark .edit-control-button { + border: 1px solid var(--border-color-dark); + background-color: var(--light-gray); + color: #efefef; +} + +/* --- Simple Version Navigation --- */ +.version-navigation { + position: absolute; + bottom: -23px; + right: 0; + display: flex; + align-items: center; + gap: 5px; + opacity: 0; + transition: opacity 0.2s; +} + +.message:hover .version-navigation, +.user-message:hover .version-navigation, +.assistant-message:hover .version-navigation { + opacity: 1; +} + +.version-nav-button { + padding: 2px 6px; + font-size: 12px; + min-width: auto; +} + +.version-nav-button[disabled] { + opacity: 0.3; + cursor: not-allowed; +} + +.version-position { + font-size: 11px; + color: 
currentcolor; + font-family: monospace; + min-width: 35px; + text-align: center; + opacity: 0.8; + user-select: none; +} + +.token-display { + font-family: monospace; + font-size: 13px; + color: var(--body-text-color-subdued); + margin-top: 4px; +} + +button:focus { + outline: none; +} diff --git a/docker/amd/Dockerfile b/docker/amd/Dockerfile index 66e5863c..c23083f7 100644 --- a/docker/amd/Dockerfile +++ b/docker/amd/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui RUN GPU_CHOICE=B LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose -COPY CMD_FLAGS.txt /home/app/text-generation-webui/ +COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui # set umask to ensure group read / write at runtime diff --git a/docker/amd/docker-compose.yml b/docker/amd/docker-compose.yml index 4709ae94..a727ca3e 100644 --- a/docker/amd/docker-compose.yml +++ b/docker/amd/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: @@ -41,14 +41,4 @@ services: security_opt: - seccomp=unconfined volumes: - - ./cache:/home/app/text-generation-webui/cache - - ./characters:/home/app/text-generation-webui/characters - - ./extensions:/home/app/text-generation-webui/extensions - - ./loras:/home/app/text-generation-webui/loras - - ./logs:/home/app/text-generation-webui/logs - - ./models:/home/app/text-generation-webui/models - - ./presets:/home/app/text-generation-webui/presets - - ./prompts:/home/app/text-generation-webui/prompts - - ./softprompts:/home/app/text-generation-webui/softprompts - - ./training:/home/app/text-generation-webui/training - - ./cloudflared:/etc/cloudflared + - ./user_data:/home/app/text-generation-webui/user_data diff --git a/docker/cpu/docker-compose.yml b/docker/cpu/docker-compose.yml index c9d415ae..9aba314a 100644 --- a/docker/cpu/docker-compose.yml +++ b/docker/cpu/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: diff --git a/docker/intel/Dockerfile b/docker/intel/Dockerfile index cab62442..4a709803 100644 --- a/docker/intel/Dockerfile +++ b/docker/intel/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui RUN GPU_CHOICE=D LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose -COPY CMD_FLAGS.txt /home/app/text-generation-webui/ +COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} # set umask to ensure group read / write at runtime WORKDIR /home/app/text-generation-webui diff --git a/docker/intel/docker-compose.yml b/docker/intel/docker-compose.yml index 31e9dde0..bb48dd22 100644 --- a/docker/intel/docker-compose.yml +++ b/docker/intel/docker-compose.yml @@ -22,7 +22,7 @@ services: 
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: @@ -41,12 +41,4 @@ services: security_opt: - seccomp=unconfined volumes: - - ./characters:/home/app/text-generation-webui/characters - - ./extensions:/home/app/text-generation-webui/extensions - - ./loras:/home/app/text-generation-webui/loras - - ./models:/home/app/text-generation-webui/models - - ./presets:/home/app/text-generation-webui/presets - - ./prompts:/home/app/text-generation-webui/prompts - - ./softprompts:/home/app/text-generation-webui/softprompts - - ./training:/home/app/text-generation-webui/training - - ./cloudflared:/etc/cloudflared + - ./user_data:/home/app/text-generation-webui/user_data diff --git a/docker/nvidia/Dockerfile b/docker/nvidia/Dockerfile index 900a4329..82594a26 100644 --- a/docker/nvidia/Dockerfile +++ b/docker/nvidia/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose -COPY CMD_FLAGS.txt /home/app/text-generation-webui/ +COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui # set umask to ensure group read / write at runtime diff --git a/docker/nvidia/docker-compose.yml b/docker/nvidia/docker-compose.yml index 835dd838..23d5cacc 100644 --- a/docker/nvidia/docker-compose.yml +++ b/docker/nvidia/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: @@ -31,17 +31,7 @@ services: stdin_open: true tty: true volumes: - - ./cache:/home/app/text-generation-webui/cache - - ./characters:/home/app/text-generation-webui/characters - - ./extensions:/home/app/text-generation-webui/extensions - - ./loras:/home/app/text-generation-webui/loras - - ./logs:/home/app/text-generation-webui/logs - - ./models:/home/app/text-generation-webui/models - - ./presets:/home/app/text-generation-webui/presets - - ./prompts:/home/app/text-generation-webui/prompts - - ./softprompts:/home/app/text-generation-webui/softprompts - - ./training:/home/app/text-generation-webui/training - - ./cloudflared:/etc/cloudflared + - ./user_data:/home/app/text-generation-webui/user_data deploy: resources: reservations: diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 364c6b09..db9befed 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -257,6 +257,85 @@ headers = { in any of the examples above. +#### Tool/Function Calling Example + +You need to use a model with tools support. The prompt will be automatically formatted using the model's Jinja2 template. + +Request: + +``` +curl http://127.0.0.1:5000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "What time is it currently in New York City?" 
+ } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_time", + "description": "Get current time in a specific timezone", + "parameters": { + "type": "object", + "required": ["timezone"], + "properties": { + "timezone": { + "type": "string", + "description": "IANA timezone name (e.g., America/New_York, Europe/London). Use Europe/Berlin as local timezone if no timezone provided by the user." + } + } + } + } + } + ] + }' +``` + +Sample response: + +``` +{ + "id": "chatcmpl-1746532051477984256", + "object": "chat.completion", + "created": 1746532051, + "model": "qwen2.5-coder-14b-instruct-q4_k_m.gguf", + "choices": [ + { + "index": 0, + "finish_reason": "tool_calls", + "message": { + "role": "assistant", + "content": "```xml\n<tool_call>\n{\n \"name\": \"get_current_time\",\n \"arguments\": {\n \"timezone\": \"America/New_York\"\n }\n}\n</tool_call>\n```" + }, + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_current_time", + "arguments": "{\"timezone\": \"America/New_York\"}" + }, + "id": "call_52ij07mh", + "index": "0" + } + ] + } + ], + "usage": { + "prompt_tokens": 224, + "completion_tokens": 38, + "total_tokens": 262 + } +} +``` + ### Environment variables The following environment variables can be used (they take precedence over everything else): diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 75e2cc11..5181b18b 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -1,16 +1,14 @@ -import base64 import copy -import re +import json import time from collections import deque -from io import BytesIO -import requests import tiktoken -from PIL import Image +from pydantic import ValidationError from extensions.openai.errors import InvalidRequestError -from extensions.openai.utils import debug_msg +from extensions.openai.typing import ToolDefinition +from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall from modules import shared from modules.chat import ( generate_chat_prompt, @@ -96,72 +94,32 @@ def convert_history(history): user_input_last = True system_message = "" - # Multimodal: convert OpenAI format to multimodal extension format - if any('content' in entry and isinstance(entry['content'], list) for entry in history): - new_history = [] - for entry in history: - if isinstance(entry['content'], list): - for item in entry['content']: - if not isinstance(item, dict): - continue - - image_url = None - content = None - if item['type'] == 'image_url' and isinstance(item['image_url'], dict): - image_url = item['image_url']['url'] - elif item['type'] == 'text' and isinstance(item['text'], str): - content = item['text'] - if image_url: - new_history.append({"image_url": image_url, "role": "user"}) - if content: - new_history.append({"content": content, "role": "user"}) - else: - new_history.append(entry) - - history = new_history - for entry in history: - if "image_url" in entry: - image_url = entry['image_url'] - if "base64" in image_url: - image_url = re.sub('^data:image/.+;base64,', '', image_url) - img = Image.open(BytesIO(base64.b64decode(image_url))) - else: - try: - my_res = requests.get(image_url) - img = Image.open(BytesIO(my_res.content)) - except Exception: - raise 'Image cannot be loaded from the URL!'
- - buffered = BytesIO() - if img.mode in ("RGBA", "P"): - img = img.convert("RGB") - - img.save(buffered, format="JPEG") - img_str = base64.b64encode(buffered.getvalue()).decode('utf-8') - content = f'' - else: - content = entry["content"] - + content = entry["content"] role = entry["role"] if role == "user": user_input = content user_input_last = True if current_message: - chat_dialogue.append([current_message, '']) + chat_dialogue.append([current_message, '', '']) current_message = "" current_message = content elif role == "assistant": + if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "": + continue # skip tool calls current_reply = content user_input_last = False if current_message: - chat_dialogue.append([current_message, current_reply]) + chat_dialogue.append([current_message, current_reply, '']) current_message = "" current_reply = "" else: - chat_dialogue.append(['', current_reply]) + chat_dialogue.append(['', current_reply, '']) + elif role == "tool": + user_input_last = False + chat_dialogue.append(['', '', content]) elif role == "system": system_message += f"\n{content}" if system_message else content @@ -181,6 +139,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p if 'messages' not in body: raise InvalidRequestError(message="messages is required", param='messages') + tools = None + if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0: + tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails + messages = body['messages'] for m in messages: if 'role' not in m: @@ -238,6 +200,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p 'custom_system_message': custom_system_message, 'chat_template_str': chat_template_str, 'chat-instruct_command': chat_instruct_command, + 'tools': tools, 'history': history, 'stream': stream }) @@ -250,7 +213,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p requested_model = generate_params.pop('model') logprob_proc = generate_params.pop('logprob_proc', None) - def chat_streaming_chunk(content): + def chat_streaming_chunk(content, chunk_tool_calls=None): # begin streaming chunk = { "id": cmpl_id, @@ -260,7 +223,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p resp_list: [{ "index": 0, "finish_reason": None, - "delta": {'role': 'assistant', 'content': content}, + "delta": {'role': 'assistant', 'content': content, 'tool_calls': chunk_tool_calls}, }], } @@ -269,6 +232,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]} # else: # chunk[resp_list][0]["logprobs"] = None + return chunk # generate reply ####################################### @@ -277,8 +241,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p yield {'prompt': prompt} return - debug_msg({'prompt': prompt, 'generate_params': generate_params}) - if stream: yield chat_streaming_chunk('') @@ -288,8 +250,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p answer = '' seen_content = '' + tool_calls = [] + end_last_tool_call = 0 + supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None + for a in generator: answer = a['internal'][-1][1] + + if supported_tools is not None: + tool_call = 
parseToolCall(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else [] + if len(tool_call) > 0: + for tc in tool_call: + tc["id"] = getToolCallId() + tc["index"] = str(len(tool_calls)) + tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"]) + tool_calls.append(tc) + end_last_tool_call = len(answer) + if stream: len_seen = len(seen_content) new_content = answer[len_seen:] @@ -297,18 +274,25 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet. continue - seen_content = answer chunk = chat_streaming_chunk(new_content) + + seen_content = answer yield chunk + # stop generation if tool_calls were generated previously + if len(tool_calls) > 0: + break + token_count = len(encode(prompt)[0]) completion_token_count = len(encode(answer)[0]) stop_reason = "stop" + if len(tool_calls) > 0: + stop_reason = "tool_calls" if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']: stop_reason = "length" if stream: - chunk = chat_streaming_chunk('') + chunk = chat_streaming_chunk('', tool_calls) chunk[resp_list][0]['finish_reason'] = stop_reason chunk['usage'] = { "prompt_tokens": token_count, @@ -326,7 +310,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p resp_list: [{ "index": 0, "finish_reason": stop_reason, - "message": {"role": "assistant", "content": answer} + "message": {"role": "assistant", "content": answer}, + "tool_calls": tool_calls }], "usage": { "prompt_tokens": token_count, @@ -515,3 +500,19 @@ def completions(body: dict, is_legacy: bool = False) -> dict: def stream_completions(body: dict, is_legacy: bool = False): for resp in completions_common(body, is_legacy, stream=True): yield resp + + +def validateTools(tools: list[dict]): + # Validate each tool definition in the JSON array + valid_tools = None + for idx in range(len(tools)): + tool = tools[idx] + try: + tool_definition = ToolDefinition(**tool) + if valid_tools is None: + valid_tools = [] + valid_tools.append(tool) + except ValidationError: + raise InvalidRequestError(message=f"Invalid tool specification at index {idx}.", param='tools') + + return valid_tools diff --git a/extensions/openai/script.py b/extensions/openai/script.py index a995da9d..24bcd69d 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -14,6 +14,7 @@ from fastapi.requests import Request from fastapi.responses import JSONResponse from pydub import AudioSegment from sse_starlette import EventSourceResponse +from starlette.concurrency import iterate_in_threadpool import extensions.openai.completions as OAIcompletions import extensions.openai.images as OAIimages @@ -114,18 +115,28 @@ async def openai_completions(request: Request, request_data: CompletionRequest): if request_data.stream: async def generator(): async with streaming_semaphore: - response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy) - for resp in response: - disconnected = await request.is_disconnected() - if disconnected: - break + try: + response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy) + async for resp in iterate_in_threadpool(response): + disconnected = await request.is_disconnected() + if disconnected: + break - yield {"data": json.dumps(resp)} + yield {"data": json.dumps(resp)} + finally: + stop_everything_event() + 
response.close() + return return EventSourceResponse(generator()) # SSE streaming else: - response = OAIcompletions.completions(to_dict(request_data), is_legacy=is_legacy) + response = await asyncio.to_thread( + OAIcompletions.completions, + to_dict(request_data), + is_legacy=is_legacy + ) + return JSONResponse(response) @@ -137,18 +148,28 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion if request_data.stream: async def generator(): async with streaming_semaphore: - response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy) - for resp in response: - disconnected = await request.is_disconnected() - if disconnected: - break + try: + response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy) + async for resp in iterate_in_threadpool(response): + disconnected = await request.is_disconnected() + if disconnected: + break - yield {"data": json.dumps(resp)} + yield {"data": json.dumps(resp)} + finally: + stop_everything_event() + response.close() + return return EventSourceResponse(generator()) # SSE streaming else: - response = OAIcompletions.chat_completions(to_dict(request_data), is_legacy=is_legacy) + response = await asyncio.to_thread( + OAIcompletions.chat_completions, + to_dict(request_data), + is_legacy=is_legacy + ) + return JSONResponse(response) @@ -436,7 +457,7 @@ def run_server(): # Start server logging.getLogger("uvicorn.error").propagate = False - uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) + uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False) def setup(): diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index b1979cbc..b28ebb4e 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -1,8 +1,8 @@ import json import time -from typing import Dict, List +from typing import Dict, List, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, validator class GenerationOptions(BaseModel): @@ -54,6 +54,48 @@ class GenerationOptions(BaseModel): grammar_string: str = "" +class ToolDefinition(BaseModel): + function: 'ToolFunction' + type: str + + +class ToolFunction(BaseModel): + description: str + name: str + parameters: 'ToolParameters' + + +class ToolParameters(BaseModel): + properties: Optional[Dict[str, 'ToolProperty']] = None + required: Optional[list[str]] = None + type: str + description: Optional[str] = None + + +class ToolProperty(BaseModel): + description: Optional[str] = None + type: Optional[str] = None # we are faced with definitions like anyOf, e.g. 
{'type': 'function', 'function': {'name': 'git_create_branch', 'description': 'Creates a new branch from an optional base branch', 'parameters': {'type': 'object', 'properties': {'repo_path': {'title': 'Repo Path', 'type': 'string'}, 'branch_name': {'title': 'Branch Name', 'type': 'string'}, 'base_branch': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'Base Branch'}}, 'required': ['repo_path', 'branch_name'], 'title': 'GitCreateBranch'}}} + + +class FunctionCall(BaseModel): + name: str + arguments: Optional[str] = None + parameters: Optional[str] = None + + @validator('arguments', allow_reuse=True) + def checkPropertyArgsOrParams(cls, v, values, **kwargs): + if not v and not values.get('parameters'): + raise ValueError("At least one of 'arguments' or 'parameters' must be provided as property in FunctionCall type") + return v + + +class ToolCall(BaseModel): + id: str + index: int + type: str + function: FunctionCall + + class CompletionRequestParams(BaseModel): model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.") prompt: str | List[str] @@ -92,6 +134,7 @@ class ChatCompletionRequestParams(BaseModel): frequency_penalty: float | None = 0 function_call: str | dict | None = Field(default=None, description="Unused parameter.") functions: List[dict] | None = Field(default=None, description="Unused parameter.") + tools: List[dict] | None = Field(default=None, description="Tools signatures passed via MCP.") logit_bias: dict | None = None max_tokens: int | None = None n: int | None = Field(default=1, description="Unused parameter.") diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py index 2b414769..9a1de2e7 100644 --- a/extensions/openai/utils.py +++ b/extensions/openai/utils.py @@ -1,5 +1,8 @@ import base64 +import json import os +import random +import re import time import traceback from typing import Callable, Optional @@ -52,3 +55,94 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star time.sleep(3) raise Exception('Could not start cloudflared.') + + +def getToolCallId() -> str: + letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789" + b = [random.choice(letter_bytes) for _ in range(8)] + return "call_" + "".join(b).lower() + + +def checkAndSanitizeToolCallCandidate(candidate_dict: dict, tool_names: list[str]): + # check if property 'function' exists and is a dictionary, otherwise adapt dict + if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str): + candidate_dict = {"type": "function", "function": candidate_dict} + if 'function' in candidate_dict and isinstance(candidate_dict['function'], str): + candidate_dict['name'] = candidate_dict['function'] + del candidate_dict['function'] + candidate_dict = {"type": "function", "function": candidate_dict} + if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict): + # check if 'name' exists within 'function' and is part of known tools + if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names: + candidate_dict["type"] = "function" # ensure required property 'type' exists and has the right value + # map property 'parameters' used by some older models to 'arguments' + if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]: + candidate_dict["function"]["arguments"] = candidate_dict["function"]["parameters"] + del 
candidate_dict["function"]["parameters"] + return candidate_dict + return None + + +def parseToolCall(answer: str, tool_names: list[str]): + matches = [] + + # abort on very short answers to save computation cycles + if len(answer) < 10: + return matches + + # Define the regex pattern to find the JSON content wrapped in XML-like tags (such as <tool_call>) observed from various models + patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)</\1>"] + + for pattern in patterns: + for match in re.finditer(pattern, answer, re.DOTALL): + # print(match.group(2)) + if match.group(2) is None: + continue + # remove backtick wraps if present + candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip()) + candidate = re.sub(r"```$", "", candidate.strip()) + # unwrap inner tags + candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL) + # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually + if re.search(r"\}\s*\n\s*\{", candidate) is not None: + candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate) + if not candidate.strip().startswith("["): + candidate = "[" + candidate + "]" + + candidates = [] + try: + # parse the candidate JSON into a dictionary + candidates = json.loads(candidate) + if not isinstance(candidates, list): + candidates = [candidates] + except json.JSONDecodeError: + # Ignore invalid JSON silently + continue + + for candidate_dict in candidates: + checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names) + if checked_candidate is not None: + matches.append(checked_candidate) + + # last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags + if len(matches) == 0: + try: + candidate = answer + # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually + if re.search(r"\}\s*\n\s*\{", candidate) is not None: + candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate) + if not candidate.strip().startswith("["): + candidate = "[" + candidate + "]" + # parse the candidate JSON into a dictionary + candidates = json.loads(candidate) + if not isinstance(candidates, list): + candidates = [candidates] + for candidate_dict in candidates: + checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names) + if checked_candidate is not None: + matches.append(checked_candidate) + except json.JSONDecodeError: + # Ignore invalid JSON silently + pass + + return matches diff --git a/extensions/superboogav2/chromadb.py b/extensions/superboogav2/chromadb.py index 6e93dd92..9344e25c 100644 --- a/extensions/superboogav2/chromadb.py +++ b/extensions/superboogav2/chromadb.py @@ -1,10 +1,11 @@ import math import random import threading -import torch + import chromadb import numpy as np import posthog +import torch from chromadb.config import Settings from chromadb.utils import embedding_functions @@ -292,6 +293,8 @@ class ChromaCollector(): for doc in documents: doc_tokens = encode(doc)[0] + if isinstance(doc_tokens, np.ndarray): + doc_tokens = doc_tokens.tolist() doc_token_count = len(doc_tokens) if current_token_count + doc_token_count > max_token_count: # If adding this document would exceed the max token count, diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 29d2d8bd..3274f47e 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -1,3 +1,7 @@ +// ------------------------------------------------- +// Event handlers
+// ------------------------------------------------- + function copyToClipboard(element) { if (!element) return; @@ -18,6 +22,201 @@ function copyToClipboard(element) { }); } +function branchHere(element) { + if (!element) return; + + const messageElement = element.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const index = messageElement.getAttribute("data-index"); + if (!index) return; + + const branchIndexInput = document.getElementById("Branch-index").querySelector("input"); + if (!branchIndexInput) { + console.error("Element with ID 'Branch-index' not found."); + return; + } + const branchButton = document.getElementById("Branch"); + + if (!branchButton) { + console.error("Required element 'Branch' not found."); + return; + } + + branchIndexInput.value = index; + + // Trigger any 'change' or 'input' events Gradio might be listening for + const event = new Event("input", { bubbles: true }); + branchIndexInput.dispatchEvent(event); + + branchButton.click(); +} + +// ------------------------------------------------- +// Message Editing Functions +// ------------------------------------------------- + +function editHere(buttonElement) { + if (!buttonElement) return; + + const messageElement = buttonElement.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const messageBody = messageElement.querySelector(".message-body"); + if (!messageBody) return; + + // If already editing, focus the textarea + const existingTextarea = messageBody.querySelector(".editing-textarea"); + if (existingTextarea) { + existingTextarea.focus(); + return; + } + + // Determine role based on message element - handle different chat modes + const isUserMessage = messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") !== null || + messageElement.querySelector(".circle-you") !== null; + + startEditing(messageElement, messageBody, isUserMessage); +} + +function startEditing(messageElement, messageBody, isUserMessage) { + const rawText = messageElement.getAttribute("data-raw") || messageBody.textContent; + const originalHTML = messageBody.innerHTML; + + // Create editing interface + const editingInterface = createEditingInterface(rawText); + + // Replace message content + messageBody.innerHTML = ""; + messageBody.appendChild(editingInterface.textarea); + messageBody.appendChild(editingInterface.controls); + + editingInterface.textarea.focus(); + editingInterface.textarea.setSelectionRange(rawText.length, rawText.length); + + // Setup event handlers + setupEditingHandlers(editingInterface.textarea, messageElement, originalHTML, messageBody, isUserMessage); +} + +function createEditingInterface(text) { + const textarea = document.createElement("textarea"); + textarea.value = text; + textarea.className = "editing-textarea"; + textarea.rows = Math.max(3, text.split("\n").length); + + const controls = document.createElement("div"); + controls.className = "edit-controls-container"; + + const saveButton = document.createElement("button"); + saveButton.textContent = "Save"; + saveButton.className = "edit-control-button"; + saveButton.type = "button"; + + const cancelButton = document.createElement("button"); + cancelButton.textContent = "Cancel"; + cancelButton.className = "edit-control-button edit-cancel-button"; + cancelButton.type = "button"; + + controls.appendChild(saveButton); + controls.appendChild(cancelButton); + + return { textarea, controls, saveButton, cancelButton }; +} + +function 
setupEditingHandlers(textarea, messageElement, originalHTML, messageBody, isUserMessage) { + const saveButton = messageBody.querySelector(".edit-control-button:not(.edit-cancel-button)"); + const cancelButton = messageBody.querySelector(".edit-cancel-button"); + + const submitEdit = () => { + const index = messageElement.getAttribute("data-index"); + if (!index || !submitMessageEdit(index, textarea.value, isUserMessage)) { + cancelEdit(); + } + }; + + const cancelEdit = () => { + messageBody.innerHTML = originalHTML; + }; + + // Event handlers + saveButton.onclick = submitEdit; + cancelButton.onclick = cancelEdit; + + textarea.onkeydown = (e) => { + if (e.key === "Enter" && !e.shiftKey) { + e.preventDefault(); + submitEdit(); + } else if (e.key === "Escape") { + e.preventDefault(); + cancelEdit(); + } + }; +} + +function submitMessageEdit(index, newText, isUserMessage) { + const editIndexInput = document.getElementById("Edit-message-index")?.querySelector("input"); + const editTextInput = document.getElementById("Edit-message-text")?.querySelector("textarea"); + const editRoleInput = document.getElementById("Edit-message-role")?.querySelector("textarea"); + const editButton = document.getElementById("Edit-message"); + + if (!editIndexInput || !editTextInput || !editRoleInput || !editButton) { + console.error("Edit elements not found"); + return false; + } + + editIndexInput.value = index; + editTextInput.value = newText; + editRoleInput.value = isUserMessage ? "user" : "assistant"; + + editIndexInput.dispatchEvent(new Event("input", { bubbles: true })); + editTextInput.dispatchEvent(new Event("input", { bubbles: true })); + editRoleInput.dispatchEvent(new Event("input", { bubbles: true })); + + editButton.click(); + return true; +} + +function navigateVersion(element, direction) { + if (!element) return; + + const messageElement = element.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const index = messageElement.getAttribute("data-index"); + if (!index) return; + + // Determine role based on message element classes + let role = "assistant"; // Default role + if (messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") || + messageElement.querySelector(".circle-you")) { + role = "user"; + } + + const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input"); + const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea"); + const roleInput = document.getElementById("Navigate-message-role")?.querySelector("textarea"); + const navigateButton = document.getElementById("Navigate-version"); + + if (!indexInput || !directionInput || !roleInput || !navigateButton) { + console.error("Navigation control elements (index, direction, role, or button) not found."); + return; + } + + indexInput.value = index; + directionInput.value = direction; + roleInput.value = role; + + // Trigger 'input' events for Gradio to pick up changes + const event = new Event("input", { bubbles: true }); + indexInput.dispatchEvent(event); + directionInput.dispatchEvent(event); + roleInput.dispatchEvent(event); + + navigateButton.click(); +} + function regenerateClick() { document.getElementById("Regenerate").click(); } diff --git a/js/main.js b/js/main.js index 33b7d6bd..f23dc246 100644 --- a/js/main.js +++ b/js/main.js @@ -1,3 +1,7 @@ +// ------------------------------------------------ +// Main +// ------------------------------------------------ + let main_parent = 
document.getElementById("chat-tab").parentNode; let extensions = document.getElementById("extensions"); @@ -39,9 +43,24 @@ document.querySelector(".header_bar").addEventListener("click", function(event) //------------------------------------------------ // Keyboard shortcuts //------------------------------------------------ + +// --- Helper functions --- // +function isModifiedKeyboardEvent() { + return (event instanceof KeyboardEvent && + event.shiftKey || + event.ctrlKey || + event.altKey || + event.metaKey); +} + +function isFocusedOnEditableTextbox() { + if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") { + return !!event.target.value; + } +} + let previousTabId = "chat-tab-button"; document.addEventListener("keydown", function(event) { - // Stop generation on Esc pressed if (event.key === "Escape") { // Find the element with id 'stop' and click it @@ -49,10 +68,15 @@ document.addEventListener("keydown", function(event) { if (stopButton) { stopButton.click(); } + return; + } + + if (!document.querySelector("#chat-tab").checkVisibility() ) { + return; } // Show chat controls on Ctrl + S - else if (event.ctrlKey && event.key == "s") { + if (event.ctrlKey && event.key == "s") { event.preventDefault(); var showControlsElement = document.getElementById("show-controls"); @@ -82,24 +106,29 @@ document.addEventListener("keydown", function(event) { document.getElementById("Remove-last").click(); } - // Copy last on Ctrl + Shift + K - else if (event.ctrlKey && event.shiftKey && event.key === "K") { - event.preventDefault(); - document.getElementById("Copy-last").click(); - } - - // Replace last on Ctrl + Shift + L - else if (event.ctrlKey && event.shiftKey && event.key === "L") { - event.preventDefault(); - document.getElementById("Replace-last").click(); - } - // Impersonate on Ctrl + Shift + M else if (event.ctrlKey && event.shiftKey && event.key === "M") { event.preventDefault(); document.getElementById("Impersonate").click(); } + // --- Simple version navigation --- // + if (!isFocusedOnEditableTextbox()) { + // Version navigation on Arrow keys (horizontal) + if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") { + event.preventDefault(); + navigateLastAssistantMessage("left"); + } + + else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") { + event.preventDefault(); + if (!navigateLastAssistantMessage("right")) { + // If can't navigate right (last version), regenerate + document.getElementById("Regenerate").click(); + } + } + } + }); //------------------------------------------------ @@ -132,8 +161,6 @@ targetElement.addEventListener("scroll", function() { // Create a MutationObserver instance const observer = new MutationObserver(function(mutations) { - updateCssProperties(); - if (targetElement.classList.contains("_generating")) { typing.parentNode.classList.add("visible-dots"); document.getElementById("stop").style.display = "flex"; @@ -144,12 +171,24 @@ const observer = new MutationObserver(function(mutations) { document.getElementById("Generate").style.display = "flex"; } - doSyntaxHighlighting(); if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) { targetElement.scrollTop = targetElement.scrollHeight; } + + const chatElement = document.getElementById("chat"); + if (chatElement && chatElement.getAttribute("data-mode") === "instruct") { + const messagesContainer = chatElement.querySelector(".messages"); + const lastChild = messagesContainer?.lastElementChild; + const prevSibling = lastChild?.previousElementSibling; 
+ if (lastChild && prevSibling) { + lastChild.style.setProperty("margin-bottom", + `max(0px, calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px))`, + "important" + ); + } + } }); // Configure the observer to watch for changes in the subtree and attributes @@ -436,38 +475,6 @@ const chatInput = document.querySelector("#chat-input textarea"); // Variables to store current dimensions let currentChatInputHeight = chatInput.clientHeight; -// Update chat layout based on chat and input dimensions -function updateCssProperties() { - const chatInputHeight = chatInput.clientHeight; - - // Check if the chat container is visible - if (chatContainer.clientHeight > 0) { - const chatContainerParentHeight = chatContainer.parentNode.clientHeight; - const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`; - - document.documentElement.style.setProperty("--chat-height", newChatHeight); - document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`); - - // Adjust scrollTop based on input height change - if (chatInputHeight !== currentChatInputHeight) { - const deltaHeight = chatInputHeight - currentChatInputHeight; - if (!isScrolled && deltaHeight < 0) { - chatContainer.scrollTop = chatContainer.scrollHeight; - } else { - chatContainer.scrollTop += deltaHeight; - } - - currentChatInputHeight = chatInputHeight; - } - } -} - -// Observe textarea size changes and call update function -new ResizeObserver(updateCssProperties).observe(document.querySelector("#chat-input textarea")); - -// Handle changes in window size -window.addEventListener("resize", updateCssProperties); - //------------------------------------------------ // Focus on the rename text area when it becomes visible //------------------------------------------------ @@ -720,7 +727,7 @@ function isMobile() { // Function to initialize sidebars function initializeSidebars() { const isOnMobile = isMobile(); - + if (isOnMobile) { // Mobile state: Hide sidebars and set closed states [pastChatsRow, chatControlsRow, headerBar].forEach(el => { @@ -813,3 +820,55 @@ function createMobileTopBar() { } createMobileTopBar(); + +//------------------------------------------------ +// Simple Navigation Functions +//------------------------------------------------ + +function navigateLastAssistantMessage(direction) { + const chat = document.querySelector("#chat"); + if (!chat) return false; + + const messages = chat.querySelectorAll("[data-index]"); + if (messages.length === 0) return false; + + // Find the last assistant message (starting from the end) + let lastAssistantMessage = null; + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if ( + msg.classList.contains("assistant-message") || + msg.querySelector(".circle-bot") || + msg.querySelector(".text-bot") + ) { + lastAssistantMessage = msg; + break; + } + } + + if (!lastAssistantMessage) return false; + + const buttons = lastAssistantMessage.querySelectorAll(".version-nav-button"); + + for (let i = 0; i < buttons.length; i++) { + const button = buttons[i]; + const onclick = button.getAttribute("onclick"); + const disabled = button.hasAttribute("disabled"); + + const isLeft = onclick && onclick.includes("'left'"); + const isRight = onclick && onclick.includes("'right'"); + + if (!disabled) { + if (direction === "left" && isLeft) { + navigateVersion(button, direction); + return true; + } + if (direction === "right" && isRight) { + navigateVersion(button, direction); + return true; + } + } + } + + 
return false; +} diff --git a/modules/chat.py b/modules/chat.py index 4becb7f5..2db72f36 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -5,6 +5,7 @@ import html import json import pprint import re +import time from datetime import datetime from functools import partial from pathlib import Path @@ -30,12 +31,37 @@ from modules.text_generation import ( get_max_prompt_length ) from modules.utils import delete_file, get_available_characters, save_file +from modules.web_search import add_web_search_attachments def strftime_now(format): return datetime.now().strftime(format) +def get_current_timestamp(): + """Returns the current time in 24-hour format""" + return datetime.now().strftime('%b %d, %Y %H:%M') + + +def update_message_metadata(metadata_dict, role, index, **fields): + """ + Updates or adds metadata fields for a specific message. + + Args: + metadata_dict: The metadata dictionary + role: The role (user, assistant, etc) + index: The message index + **fields: Arbitrary metadata fields to update/add + """ + key = f"{role}_{index}" + if key not in metadata_dict: + metadata_dict[key] = {} + + # Update with provided fields + for field_name, field_value in fields.items(): + metadata_dict[key][field_name] = field_value + + jinja_env = ImmutableSandboxedEnvironment( trim_blocks=True, lstrip_blocks=True, @@ -132,7 +158,9 @@ def generate_chat_prompt(user_input, state, **kwargs): impersonate = kwargs.get('impersonate', False) _continue = kwargs.get('_continue', False) also_return_rows = kwargs.get('also_return_rows', False) - history = kwargs.get('history', state['history'])['internal'] + history_data = kwargs.get('history', state['history']) + history = history_data['internal'] + metadata = history_data.get('metadata', {}) # Templates chat_template_str = state['chat_template_str'] @@ -145,7 +173,7 @@ def generate_chat_prompt(user_input, state, **kwargs): instruct_renderer = partial( instruction_template.render, builtin_tools=None, - tools=None, + tools=state['tools'] if 'tools' in state else None, tools_in_user_message=False, add_generation_prompt=False ) @@ -171,18 +199,62 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "system", "content": context}) insert_pos = len(messages) - for user_msg, assistant_msg in reversed(history): - user_msg = user_msg.strip() - assistant_msg = assistant_msg.strip() + for i, entry in enumerate(reversed(history)): + user_msg = entry[0].strip() + assistant_msg = entry[1].strip() + tool_msg = entry[2].strip() if len(entry) > 2 else '' + + row_idx = len(history) - i - 1 + + if tool_msg: + messages.insert(insert_pos, {"role": "tool", "content": tool_msg}) if assistant_msg: messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']: - messages.insert(insert_pos, {"role": "user", "content": user_msg}) + # Check for user message attachments in metadata + user_key = f"user_{row_idx}" + enhanced_user_msg = user_msg + + # Add attachment content if present + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + for attachment in metadata[user_key]["attachments"]: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if attachments_text: + enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}" + + messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) user_input = 
user_input.strip() - if user_input and not impersonate and not _continue: + + # Check if we have attachments even with empty input + has_attachments = False + if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" + has_attachments = user_key in metadata and "attachments" in metadata[user_key] + + if (user_input or has_attachments) and not impersonate and not _continue: + # For the current user input being processed, check if we need to add attachments + if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" + + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + for attachment in metadata[user_key]["attachments"]: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if attachments_text: + user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}" + messages.append({"role": "user", "content": user_input}) def make_prompt(messages): @@ -251,7 +323,6 @@ def generate_chat_prompt(user_input, state, **kwargs): # Resort to truncating the user input else: - user_message = messages[-1]['content'] # Bisect the truncation point @@ -288,6 +359,50 @@ def generate_chat_prompt(user_input, state, **kwargs): return prompt +def count_prompt_tokens(text_input, state): + """Count tokens for current history + input including attachments""" + if shared.tokenizer is None: + return "Tokenizer not available" + + try: + # Handle dict format with text and files + files = [] + if isinstance(text_input, dict): + files = text_input.get('files', []) + text = text_input.get('text', '') + else: + text = text_input + files = [] + + # Create temporary history copy to add attachments + temp_history = copy.deepcopy(state['history']) + if 'metadata' not in temp_history: + temp_history['metadata'] = {} + + # Process attachments if any + if files: + row_idx = len(temp_history['internal']) + for file_path in files: + add_message_attachment(temp_history, row_idx, file_path, is_user=True) + + # Create temp state with modified history + temp_state = copy.deepcopy(state) + temp_state['history'] = temp_history + + # Build prompt using existing logic + prompt = generate_chat_prompt(text, temp_state) + current_tokens = get_encoded_length(prompt) + max_tokens = temp_state['truncation_length'] + + percentage = (current_tokens / max_tokens) * 100 if max_tokens > 0 else 0 + + return f"History + Input:
{current_tokens:,} / {max_tokens:,} tokens ({percentage:.1f}%)" + + except Exception as e: + logger.error(f"Error counting tokens: {e}") + return f"Error: {str(e)}" + + def get_stopping_strings(state): stopping_strings = [] renderers = [] @@ -336,6 +451,114 @@ def get_stopping_strings(state): return result +def add_message_version(history, role, row_idx, is_current=True): + key = f"{role}_{row_idx}" + if 'metadata' not in history: + history['metadata'] = {} + if key not in history['metadata']: + history['metadata'][key] = {} + + if "versions" not in history['metadata'][key]: + history['metadata'][key]["versions"] = [] + + # Determine which index to use for content based on role + content_idx = 0 if role == 'user' else 1 + current_content = history['internal'][row_idx][content_idx] + current_visible = history['visible'][row_idx][content_idx] + + history['metadata'][key]["versions"].append({ + "content": current_content, + "visible_content": current_visible, + "timestamp": get_current_timestamp() + }) + + if is_current: + # Set the current_version_index to the newly added version (which is now the last one). + history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1 + + +def add_message_attachment(history, row_idx, file_path, is_user=True): + """Add a file attachment to a message in history metadata""" + if 'metadata' not in history: + history['metadata'] = {} + + key = f"{'user' if is_user else 'assistant'}_{row_idx}" + + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "attachments" not in history['metadata'][key]: + history['metadata'][key]["attachments"] = [] + + # Get file info using pathlib + path = Path(file_path) + filename = path.name + file_extension = path.suffix.lower() + + try: + # Handle different file types + if file_extension == '.pdf': + # Process PDF file + content = extract_pdf_text(path) + file_type = "application/pdf" + else: + # Default handling for text files + with open(path, 'r', encoding='utf-8') as f: + content = f.read() + file_type = "text/plain" + + # Add attachment + attachment = { + "name": filename, + "type": file_type, + "content": content, + } + + history['metadata'][key]["attachments"].append(attachment) + return content # Return the content for reuse + except Exception as e: + logger.error(f"Error processing attachment {filename}: {e}") + return None + + +def extract_pdf_text(pdf_path): + """Extract text from a PDF file""" + import PyPDF2 + + text = "" + try: + with open(pdf_path, 'rb') as file: + pdf_reader = PyPDF2.PdfReader(file) + for page_num in range(len(pdf_reader.pages)): + page = pdf_reader.pages[page_num] + text += page.extract_text() + "\n\n" + + return text.strip() + except Exception as e: + logger.error(f"Error extracting text from PDF: {e}") + return f"[Error extracting PDF text: {str(e)}]" + + +def generate_search_query(user_message, state): + """Generate a search query from user message using the LLM""" + # Augment the user message with search instruction + augmented_message = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else." 
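+    # Because the prompt built below reuses the regular chat template and the full chat
+    # history, a follow-up question (e.g. "what about tomorrow?") should still produce a
+    # context-aware search query rather than a standalone one.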
+ + # Use a minimal state for search query generation but keep the full history + search_state = state.copy() + search_state['max_new_tokens'] = 64 + search_state['auto_max_new_tokens'] = False + search_state['enable_thinking'] = False + + # Generate the full prompt using existing history + augmented message + formatted_prompt = generate_chat_prompt(augmented_message, search_state) + + query = "" + for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True): + query = reply.strip() + + return query + + def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False): # Handle dict format with text and files files = [] @@ -509,16 +732,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess yield output -def impersonate_wrapper(text, state): +def impersonate_wrapper(textbox, state): + text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) prompt = generate_chat_prompt('', state, impersonate=True) stopping_strings = get_stopping_strings(state) - yield text + '...', static_output + textbox['text'] = text + '...' + yield textbox, static_output reply = None for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True): - yield (text + reply).lstrip(' '), static_output + textbox['text'] = (text + reply).lstrip(' ') + yield textbox, static_output if shared.stop_everything: return @@ -564,56 +790,81 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): send_dummy_reply(state['start_with'], state) history = state['history'] + last_save_time = time.monotonic() + save_interval = 8 for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)): yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history + current_time = time.monotonic() + # Save on first iteration or if save_interval seconds have passed + if i == 0 or (current_time - last_save_time) >= save_interval: + save_history(history, state['unique_id'], state['character_menu'], state['mode']) + last_save_time = current_time + save_history(history, state['unique_id'], state['character_menu'], state['mode']) def remove_last_message(history): + if 'metadata' not in history: + history['metadata'] = {} + if len(history['visible']) > 0 and history['internal'][-1][0] != '<|BEGIN-VISIBLE-CHAT|>': + row_idx = len(history['internal']) - 1 last = history['visible'].pop() history['internal'].pop() + + # Remove metadata directly by known keys + if f"user_{row_idx}" in history['metadata']: + del history['metadata'][f"user_{row_idx}"] + if f"assistant_{row_idx}" in history['metadata']: + del history['metadata'][f"assistant_{row_idx}"] else: last = ['', ''] return html.unescape(last[0]), history -def send_last_reply_to_input(history): - if len(history['visible']) > 0: - return html.unescape(history['visible'][-1][1]) - else: - return '' - - -def replace_last_reply(text, state): - history = state['history'] - - if len(text.strip()) == 0: - return history - elif len(history['visible']) > 0: - history['visible'][-1][1] = html.escape(text) - history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) - - return history - - def send_dummy_message(text, state): history = state['history'] + + # Handle both dict and string inputs + if 
isinstance(text, dict): + text = text['text'] + + # Initialize metadata if not present + if 'metadata' not in history: + history['metadata'] = {} + + row_idx = len(history['internal']) history['visible'].append([html.escape(text), '']) history['internal'].append([apply_extensions('input', text, state, is_chat=True), '']) + update_message_metadata(history['metadata'], "user", row_idx, timestamp=get_current_timestamp()) + return history def send_dummy_reply(text, state): history = state['history'] + + # Handle both dict and string inputs + if isinstance(text, dict): + text = text['text'] + + # Initialize metadata if not present + if 'metadata' not in history: + history['metadata'] = {} + if len(history['visible']) > 0 and not history['visible'][-1][1] == '': + row_idx = len(history['internal']) history['visible'].append(['', '']) history['internal'].append(['', '']) + # We don't need to add system metadata + row_idx = len(history['internal']) - 1 history['visible'][-1][1] = html.escape(text) history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) + update_message_metadata(history['metadata'], "assistant", row_idx, timestamp=get_current_timestamp()) + return history @@ -623,7 +874,8 @@ def redraw_html(history, name1, name2, mode, style, character, reset_cache=False def start_new_chat(state): mode = state['mode'] - history = {'internal': [], 'visible': []} + # Initialize with empty metadata dictionary + history = {'internal': [], 'visible': [], 'metadata': {}} if mode != 'instruct': greeting = replace_character_names(state['greeting'], state['name1'], state['name2']) @@ -631,6 +883,9 @@ def start_new_chat(state): history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]] history['visible'] += [['', apply_extensions('output', html.escape(greeting), state, is_chat=True)]] + # Add timestamp for assistant's greeting + update_message_metadata(history['metadata'], "assistant", 0, timestamp=get_current_timestamp()) + unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') save_history(history, unique_id, state['character_menu'], state['mode']) @@ -811,6 +1066,16 @@ def load_history(unique_id, character, mode): 'visible': f['data_visible'] } + # Add metadata if it doesn't exist + if 'metadata' not in history: + history['metadata'] = {} + # Add placeholder timestamps for existing messages + for i, (user_msg, asst_msg) in enumerate(history['internal']): + if user_msg and user_msg != '<|BEGIN-VISIBLE-CHAT|>': + update_message_metadata(history['metadata'], "user", i, timestamp="") + if asst_msg: + update_message_metadata(history['metadata'], "assistant", i, timestamp="") + return history @@ -826,6 +1091,16 @@ def load_history_json(file, history): 'visible': f['data_visible'] } + # Add metadata if it doesn't exist + if 'metadata' not in history: + history['metadata'] = {} + # Add placeholder timestamps + for i, (user_msg, asst_msg) in enumerate(history['internal']): + if user_msg and user_msg != '<|BEGIN-VISIBLE-CHAT|>': + update_message_metadata(history['metadata'], "user", i, timestamp="") + if asst_msg: + update_message_metadata(history['metadata'], "assistant", i, timestamp="") + return history except: return history @@ -1147,20 +1422,12 @@ def my_yaml_output(data): return result -def handle_replace_last_reply_click(text, state): - history = replace_last_reply(text, state) - save_history(history, state['unique_id'], state['character_menu'], state['mode']) - html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], 
state['character_menu']) - - return [history, html, ""] - - def handle_send_dummy_message_click(text, state): history = send_dummy_message(text, state) save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_send_dummy_reply_click(text, state): @@ -1168,7 +1435,7 @@ def handle_send_dummy_reply_click(text, state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_remove_last_click(state): @@ -1176,7 +1443,7 @@ def handle_remove_last_click(state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, last_input] + return [history, html, {"text": last_input, "files": []}] def handle_unique_id_select(state): @@ -1222,7 +1489,13 @@ def handle_delete_chat_confirm_click(state): def handle_branch_chat_click(state): - history = state['history'] + branch_from_index = state['branch_index'] + if branch_from_index == -1: + history = state['history'] + else: + history = state['history'] + history['visible'] = history['visible'][:branch_from_index + 1] + history['internal'] = history['internal'][:branch_from_index + 1] new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') save_history(history, new_unique_id, state['character_menu'], state['mode']) @@ -1233,7 +1506,93 @@ def handle_branch_chat_click(state): past_chats_update = gr.update(choices=histories, value=new_unique_id) - return [history, html, past_chats_update] + return [history, html, past_chats_update, -1] + + +def handle_edit_message_click(state): + history = state['history'] + message_index = int(state['edit_message_index']) + new_text = state['edit_message_text'] + role = state['edit_message_role'] # "user" or "assistant" + + if message_index >= len(history['internal']): + html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html_output] + + role_idx = 0 if role == "user" else 1 + + if 'metadata' not in history: + history['metadata'] = {} + + key = f"{role}_{message_index}" + if key not in history['metadata']: + history['metadata'][key] = {} + + # If no versions exist yet for this message, store the current (pre-edit) content as the first version. 
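+        # Each version entry stores 'content', 'visible_content' and a 'timestamp', and
+        # 'current_version_index' points at the entry currently shown in the chat.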
+ if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]: + original_content = history['internal'][message_index][role_idx] + original_visible = history['visible'][message_index][role_idx] + original_timestamp = history['metadata'][key].get('timestamp', get_current_timestamp()) + + history['metadata'][key]["versions"] = [{ + "content": original_content, + "visible_content": original_visible, + "timestamp": original_timestamp + }] + + history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True) + history['visible'][message_index][role_idx] = html.escape(new_text) + + add_message_version(history, role, message_index, is_current=True) + + save_history(history, state['unique_id'], state['character_menu'], state['mode']) + html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + + return [history, html_output] + + +def handle_navigate_version_click(state): + history = state['history'] + message_index = int(state['navigate_message_index']) + direction = state['navigate_direction'] + role = state['navigate_message_role'] + + if not role: + logger.error("Role not provided for version navigation.") + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + key = f"{role}_{message_index}" + if 'metadata' not in history or key not in history['metadata'] or 'versions' not in history['metadata'][key]: + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + metadata = history['metadata'][key] + versions = metadata['versions'] + # Default to the last version if current_version_index is not set + current_idx = metadata.get('current_version_index', len(versions) - 1 if versions else 0) + + if direction == 'left': + new_idx = max(0, current_idx - 1) + else: # right + new_idx = min(len(versions) - 1, current_idx + 1) + + if new_idx == current_idx: + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + msg_content_idx = 0 if role == 'user' else 1 # 0 for user content, 1 for assistant content in the pair + version_to_load = versions[new_idx] + history['internal'][message_index][msg_content_idx] = version_to_load['content'] + history['visible'][message_index][msg_content_idx] = version_to_load['visible_content'] + metadata['current_version_index'] = new_idx + update_message_metadata(history['metadata'], role, message_index, timestamp=version_to_load['timestamp']) + + # Redraw and save + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + save_history(history, state['unique_id'], state['character_menu'], state['mode']) + + return [history, html] def handle_rename_chat_click(): @@ -1375,7 +1734,7 @@ def handle_your_picture_change(picture, state): def handle_send_instruction_click(state): state['mode'] = 'instruct' - state['history'] = {'internal': [], 'visible': []} + state['history'] = {'internal': [], 'visible': [], 'metadata': {}} output = generate_chat_prompt("Input", state) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index 12b22f64..1254ff5d 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -119,7 +119,7 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): reset = True 
# Maximum number of tokens to process in a single forward pass - max_chunk_size = 2048 + max_chunk_size = 256 # Make the forward call if labels is None: @@ -245,3 +245,20 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path) return Exllamav3HF(pretrained_model_name_or_path) + + def unload(self): + """Properly unload the ExllamaV3 model and free GPU memory.""" + if hasattr(self, 'ex_model') and self.ex_model is not None: + self.ex_model.unload() + self.ex_model = None + + if hasattr(self, 'ex_cache') and self.ex_cache is not None: + self.ex_cache = None + + # Clean up any additional ExllamaV3 resources + if hasattr(self, 'past_seq'): + self.past_seq = None + if hasattr(self, 'past_seq_negative'): + self.past_seq_negative = None + if hasattr(self, 'ex_cache_negative'): + self.ex_cache_negative = None diff --git a/modules/html_generator.py b/modules/html_generator.py index 67d15b6e..cbf3e19c 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -169,11 +169,7 @@ def convert_to_markdown(string, message_id=None): thinking_block = f'''
- - - - - + {info_svg_small} {title_text}
{thinking_html}
@@ -339,41 +335,164 @@ copy_svg = '''''' continue_svg = '''''' remove_svg = '''''' +branch_svg = '''''' +edit_svg = '''''' +info_svg = '''''' +info_svg_small = '''''' +attachment_svg = '''''' copy_button = f'' +branch_button = f'' +edit_button = f'' refresh_button = f'' continue_button = f'' remove_button = f'' +info_button = f'' + + +def format_message_timestamp(history, role, index): + """Get a formatted timestamp HTML span for a message if available""" + key = f"{role}_{index}" + if 'metadata' in history and key in history['metadata'] and history['metadata'][key].get('timestamp'): + timestamp = history['metadata'][key]['timestamp'] + return f"{timestamp}" + + return "" + + +def format_message_attachments(history, role, index): + """Get formatted HTML for message attachments if available""" + key = f"{role}_{index}" + if 'metadata' in history and key in history['metadata'] and 'attachments' in history['metadata'][key]: + attachments = history['metadata'][key]['attachments'] + if not attachments: + return "" + + attachments_html = '
' + for attachment in attachments: + name = html.escape(attachment["name"]) + + # Make clickable if URL exists + if "url" in attachment: + name = f'{name}' + + attachments_html += ( + f'
' + f'
{attachment_svg}
' + f'
{name}
' + f'
' + ) + attachments_html += '
' + return attachments_html + + return "" + + +def get_version_navigation_html(history, i, role): + """Generate simple navigation arrows for message versions""" + key = f"{role}_{i}" + metadata = history.get('metadata', {}) + + if key not in metadata or 'versions' not in metadata[key]: + return "" + + versions = metadata[key]['versions'] + # Default to the last version if current_version_index isn't set in metadata + current_idx = metadata[key].get('current_version_index', len(versions) - 1 if versions else 0) + + if len(versions) <= 1: + return "" + + left_disabled = ' disabled' if current_idx == 0 else '' + right_disabled = ' disabled' if current_idx >= len(versions) - 1 else '' + + left_arrow = f'' + right_arrow = f'' + position = f'{current_idx + 1}/{len(versions)}' + + return f'
{left_arrow}{position}{right_arrow}
' + + +def actions_html(history, i, role, info_message=""): + action_buttons = "" + version_nav_html = "" + + if role == "assistant": + action_buttons = ( + f'{copy_button}' + f'{edit_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'{continue_button if i == len(history["visible"]) - 1 else ""}' + f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{branch_button}' + ) + + version_nav_html = get_version_navigation_html(history, i, "assistant") + elif role == "user": + action_buttons = ( + f'{copy_button}' + f'{edit_button}' + ) + + version_nav_html = get_version_navigation_html(history, i, "user") + + return (f'
' + f'{action_buttons}' + f'{info_message}' + f'
' + f'{version_nav_html}') def generate_instruct_html(history): - output = f'
' + output = f'
' for i in range(len(history['visible'])): row_visible = history['visible'][i] row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + + # Create info buttons for timestamps if they exist + info_message_user = "" + if user_timestamp != "": + # Extract the timestamp value from the span + user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_user = info_button.replace("message", user_timestamp_value) + + info_message_assistant = "" + if assistant_timestamp != "": + # Extract the timestamp value from the span + assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_assistant = info_button.replace("message", assistant_timestamp_value) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[0]}
' - f'{copy_button}' + f'{user_attachments}' + f'{actions_html(history, i, "user", info_message_user)}' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{assistant_attachments}' + f'{actions_html(history, i, "assistant", info_message_assistant)}' f'
' f'
' ) @@ -401,30 +520,39 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
{img_me}
' f'
' - f'
{name1}
' + f'
{name1}{user_timestamp}
' f'
{converted_visible[0]}
' - f'{copy_button}' + f'{user_attachments}' + f'{actions_html(history, i, "user")}' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
{img_bot}
' f'
' - f'
{name2}
' + f'
{name2}{assistant_timestamp}
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{assistant_attachments}' + f'{actions_html(history, i, "assistant")}' f'
' f'
' ) @@ -441,26 +569,48 @@ def generate_chat_html(history, name1, name2, reset_cache=False): row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + + # Create info buttons for timestamps if they exist + info_message_user = "" + if user_timestamp != "": + # Extract the timestamp value from the span + user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_user = info_button.replace("message", user_timestamp_value) + + info_message_assistant = "" + if assistant_timestamp != "": + # Extract the timestamp value from the span + assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_assistant = info_button.replace("message", assistant_timestamp_value) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[0]}
' - f'{copy_button}' + f'{user_attachments}' + f'{actions_html(history, i, "user", info_message_user)}' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{assistant_attachments}' + f'{actions_html(history, i, "assistant", info_message_assistant)}' f'
' f'
' ) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index d9187db8..d695c74e 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -66,7 +66,7 @@ class LlamaServer: "top_k": state["top_k"], "top_p": state["top_p"], "min_p": state["min_p"], - "tfs_z": state["tfs"], + "top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1, "typical_p": state["typical_p"], "repeat_penalty": state["repetition_penalty"], "repeat_last_n": state["repetition_penalty_range"], @@ -102,8 +102,10 @@ class LlamaServer: penalty_found = False for s in samplers: - if s.strip() in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]: + if s.strip() in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc"]: filtered_samplers.append(s.strip()) + elif s.strip() == "typical_p": + filtered_samplers.append("typ_p") elif not penalty_found and s.strip() == "repetition_penalty": filtered_samplers.append("penalties") penalty_found = True @@ -144,8 +146,9 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - # Make a direct request with streaming enabled using a context manager - with self.session.post(url, json=payload, stream=True) as response: + # Make the generation request + response = self.session.post(url, json=payload, stream=True) + try: response.raise_for_status() # Raise an exception for HTTP errors full_text = "" @@ -182,6 +185,8 @@ class LlamaServer: print(f"JSON decode error: {e}") print(f"Problematic line: {line}") continue + finally: + response.close() def generate(self, prompt, state): output = "" @@ -210,14 +215,15 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - response = self.session.post(url, json=payload) - result = response.json() + for retry in range(5): + response = self.session.post(url, json=payload) + result = response.json() - if "completion_probabilities" in result: - if use_samplers: - return result["completion_probabilities"][0]["top_probs"] - else: - return result["completion_probabilities"][0]["top_logprobs"] + if "completion_probabilities" in result: + if use_samplers: + return result["completion_probabilities"][0]["top_probs"] + else: + return result["completion_probabilities"][0]["top_logprobs"] else: raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}") @@ -255,9 +261,10 @@ class LlamaServer: self.server_path, "--model", self.model_path, "--ctx-size", str(shared.args.ctx_size), - "--n-gpu-layers", str(shared.args.n_gpu_layers), + "--gpu-layers", str(shared.args.gpu_layers), "--batch-size", str(shared.args.batch_size), "--port", str(self.port), + "--no-webui", ] if shared.args.flash_attn: @@ -278,8 +285,10 @@ class LlamaServer: cmd.append("--no-kv-offload") if shared.args.row_split: cmd += ["--split-mode", "row"] + cache_type = "fp16" if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types: cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type] + cache_type = shared.args.cache_type if shared.args.compress_pos_emb != 1: cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)] if shared.args.rope_freq_base > 0: @@ -316,9 +325,15 @@ class LlamaServer: for flag_item in extra_flags.split(','): if '=' in flag_item: flag, value = flag_item.split('=', 1) - cmd += [f"--{flag}", value] + if len(flag) <= 3: + cmd += [f"-{flag}", value] + else: + cmd += [f"--{flag}", value] else: 
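+                    # Flag names of three characters or fewer are treated as llama-server
+                    # short options (single dash, e.g. "-fa"); longer names keep "--".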
- cmd.append(f"--{flag_item}") + if len(flag_item) <= 3: + cmd.append(f"-{flag_item}") + else: + cmd.append(f"--{flag_item}") env = os.environ.copy() if os.name == 'posix': @@ -333,6 +348,7 @@ class LlamaServer: print(' '.join(str(item) for item in cmd[1:])) print() + logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}") # Start the server with pipes for output self.process = subprocess.Popen( cmd, diff --git a/modules/loaders.py b/modules/loaders.py index 738198b1..6fbd2198 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -5,7 +5,7 @@ import gradio as gr loaders_and_params = OrderedDict({ 'llama.cpp': [ - 'n_gpu_layers', + 'gpu_layers', 'threads', 'threads_batch', 'batch_size', @@ -28,6 +28,7 @@ loaders_and_params = OrderedDict({ 'device_draft', 'ctx_size_draft', 'speculative_decoding_accordion', + 'vram_info', ], 'Transformers': [ 'gpu_split', @@ -84,17 +85,11 @@ loaders_and_params = OrderedDict({ 'no_flash_attn', 'no_xformers', 'no_sdpa', - 'exllamav2_info', 'model_draft', 'draft_max', 'ctx_size_draft', 'speculative_decoding_accordion', ], - 'HQQ': [ - 'hqq_backend', - 'trust_remote_code', - 'no_use_fast', - ], 'TensorRT-LLM': [ 'ctx_size', 'cpp_runner', @@ -158,7 +153,6 @@ def transformers_samplers(): loaders_samplers = { 'Transformers': transformers_samplers(), - 'HQQ': transformers_samplers(), 'ExLlamav3_HF': { 'temperature', 'dynatemp_low', @@ -299,7 +293,7 @@ loaders_samplers = { 'typical_p', 'xtc_threshold', 'xtc_probability', - 'tfs', + 'top_n_sigma', 'dry_multiplier', 'dry_allowed_length', 'dry_base', diff --git a/modules/logits.py b/modules/logits.py index 32aef7ae..56a20572 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -7,6 +7,7 @@ from modules import models, shared from modules.logging_colors import logger from modules.models import load_model from modules.text_generation import generate_reply +from modules.utils import check_model_loaded global_scores = None @@ -33,9 +34,9 @@ def get_next_logits(*args, **kwargs): def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False): - if shared.model is None: - logger.error("No model is loaded! Select one in the Model tab.") - return 'Error: No model is loaded1 Select one in the Model tab.', previous + model_is_loaded, error_message = check_model_loaded() + if not model_is_loaded: + return error_message, previous # llama.cpp case if shared.model.__class__.__name__ == 'LlamaServer': diff --git a/modules/models.py b/modules/models.py index d0b0402a..d329ae3c 100644 --- a/modules/models.py +++ b/modules/models.py @@ -21,7 +21,6 @@ def load_model(model_name, loader=None): 'ExLlamav3_HF': ExLlamav3_HF_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ExLlamav2': ExLlamav2_loader, - 'HQQ': HQQ_loader, 'TensorRT-LLM': TensorRT_LLM_loader, } @@ -71,7 +70,6 @@ def llama_cpp_server_loader(model_name): else: model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] - logger.info(f"llama.cpp weights detected: \"{model_file}\"") try: model = LlamaServer(model_file) return model, model @@ -103,21 +101,6 @@ def ExLlamav2_loader(model_name): return model, tokenizer -def HQQ_loader(model_name): - try: - from hqq.core.quantize import HQQBackend, HQQLinear - from hqq.models.hf.base import AutoHQQHFModel - except ModuleNotFoundError: - raise ModuleNotFoundError("Failed to import 'hqq'. 
Please install it manually following the instructions in the HQQ GitHub repository.") - - logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"") - - model_dir = Path(f'{shared.args.model_dir}/{model_name}') - model = AutoHQQHFModel.from_quantized(str(model_dir)) - HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend)) - return model - - def TensorRT_LLM_loader(model_name): try: from modules.tensorrt_llm import TensorRTLLMModel @@ -133,10 +116,13 @@ def unload_model(keep_model_name=False): return is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer') + if shared.args.loader == 'ExLlamav3_HF': + shared.model.unload() shared.model = shared.tokenizer = None shared.lora_names = [] shared.model_dirty_from_training = False + if not is_llamacpp: from modules.torch_utils import clear_torch_cache clear_torch_cache() diff --git a/modules/models_settings.py b/modules/models_settings.py index ae589bb3..c914bdea 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -1,7 +1,11 @@ +import functools import json import re +import subprocess +from math import floor from pathlib import Path +import gradio as gr import yaml from modules import chat, loaders, metadata_gguf, shared, ui @@ -54,7 +58,7 @@ def get_model_metadata(model): else: model_file = list(path.glob('*.gguf'))[0] - metadata = metadata_gguf.load_metadata(model_file) + metadata = load_gguf_metadata_with_cache(model_file) for k in metadata: if k.endswith('context_length'): @@ -67,7 +71,8 @@ def get_model_metadata(model): elif k.endswith('rope.scaling.factor'): model_settings['compress_pos_emb'] = metadata[k] elif k.endswith('block_count'): - model_settings['n_gpu_layers'] = metadata[k] + 1 + model_settings['gpu_layers'] = metadata[k] + 1 + model_settings['max_gpu_layers'] = metadata[k] + 1 if 'tokenizer.chat_template' in metadata: template = metadata['tokenizer.chat_template'] @@ -149,7 +154,11 @@ def get_model_metadata(model): for pat in settings: if re.match(pat.lower(), Path(model).name.lower()): for k in settings[pat]: - model_settings[k] = settings[pat][k] + new_k = k + if k == 'n_gpu_layers': + new_k = 'gpu_layers' + + model_settings[new_k] = settings[pat][k] # Load instruction template if defined by name rather than by value if model_settings['instruction_template'] != 'Custom (obtained from model metadata)': @@ -174,8 +183,6 @@ def infer_loader(model_name, model_settings, hf_quant_method=None): loader = 'ExLlamav3_HF' elif re.match(r'.*exl2', model_name.lower()): loader = 'ExLlamav2_HF' - elif re.match(r'.*-hqq', model_name.lower()): - return 'HQQ' else: loader = 'Transformers' @@ -209,15 +216,27 @@ def apply_model_settings_to_state(model, state): model_settings = get_model_metadata(model) if 'loader' in model_settings: loader = model_settings.pop('loader') - - # If the user is using an alternative loader for the same model type, let them keep using it if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']): state['loader'] = loader for k in model_settings: - if k in state: + if k in state and k != 'gpu_layers': # Skip gpu_layers, handle separately state[k] = model_settings[k] + # Handle GPU layers and VRAM update for llama.cpp + if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings: + vram_info, gpu_layers_update = update_gpu_layers_and_vram( + state['loader'], + model, + model_settings['gpu_layers'], + state['ctx_size'], + state['cache_type'], + auto_adjust=True + ) + + state['gpu_layers'] = gpu_layers_update + state['vram_info'] = vram_info 
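+        # With auto_adjust=True the returned layer count is presumably clamped so that the
+        # estimate_vram() prediction fits in the free VRAM reported by get_nvidia_vram();
+        # update_gpu_layers_and_vram() itself is defined further down in this file.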
+ return state @@ -277,3 +296,197 @@ def save_instruction_template(model, template): yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.") else: yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.") + + +@functools.lru_cache(maxsize=1) +def load_gguf_metadata_with_cache(model_file): + return metadata_gguf.load_metadata(model_file) + + +def get_model_size_mb(model_file: Path) -> float: + filename = model_file.name + + # Check for multipart pattern + match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename) + + if match: + # It's a multipart file, find all matching parts + base_pattern = match.group(1) + part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf')) + total_size = sum(p.stat().st_size for p in part_files) + else: + # Single part + total_size = model_file.stat().st_size + + return total_size / (1024 ** 2) # Return size in MB + + +def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): + model_file = Path(f'{shared.args.model_dir}/{gguf_file}') + metadata = load_gguf_metadata_with_cache(model_file) + size_in_mb = get_model_size_mb(model_file) + + # Extract values from metadata + n_layers = None + n_kv_heads = None + embedding_dim = None + + for key, value in metadata.items(): + if key.endswith('.block_count'): + n_layers = value + elif key.endswith('.attention.head_count_kv'): + n_kv_heads = max(value) if isinstance(value, list) else value + elif key.endswith('.embedding_length'): + embedding_dim = value + + if gpu_layers > n_layers: + gpu_layers = n_layers + + # Convert cache_type to numeric + if cache_type == 'q4_0': + cache_type = 4 + elif cache_type == 'q8_0': + cache_type = 8 + else: + cache_type = 16 + + # Derived features + size_per_layer = size_in_mb / max(n_layers, 1e-6) + kv_cache_factor = n_kv_heads * cache_type * ctx_size + embedding_per_context = embedding_dim / ctx_size + + # Calculate VRAM using the model + # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/ + vram = ( + (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor) + * (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632))) + + 1516.522943869404 + ) + + return vram + + +def get_nvidia_vram(return_free=True): + """ + Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output. + + Args: + return_free (bool): If True, returns free VRAM. If False, returns total VRAM. + + Returns: + int: Either the total free VRAM or total VRAM in MiB summed across all detected NVIDIA GPUs. + Returns -1 if nvidia-smi command fails (not found, error, etc.). + Returns 0 if nvidia-smi succeeds but no GPU memory info found. 
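The `estimate_vram` helper above implements the fitted GGUF VRAM formula referenced in the comment. As a rough, self-contained illustration, the same arithmetic with made-up inputs (a ~4.2 GB single-file GGUF, 32 layers, 8 KV heads, 4096-dim embeddings, 8192-token context, fp16 cache) looks like this:

```python
from math import floor

# Hypothetical inputs; real values come from the GGUF header and the file size on disk.
size_in_mb = 4200.0      # model size in MB (multipart files are summed, as above)
n_layers = 32            # *.block_count
n_kv_heads = 8           # *.attention.head_count_kv
embedding_dim = 4096     # *.embedding_length
ctx_size = 8192
cache_bits = 16          # fp16 cache; q8_0 -> 8, q4_0 -> 4
gpu_layers = 33          # block_count + 1, capped at n_layers

gpu_layers = min(gpu_layers, n_layers)
size_per_layer = size_in_mb / max(n_layers, 1e-6)
kv_cache_factor = n_kv_heads * cache_bits * ctx_size
embedding_per_context = embedding_dim / ctx_size

# Same fitted formula as in the diff (https://oobabooga.github.io/blog/posts/gguf-vram-formula/)
vram_mb = (
    (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
    * (gpu_layers + max(0.9690636483914102, cache_bits - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
    + 1516.522943869404
)
print(f"{vram_mb:.0f} MiB")  # a ballpark projection, not a measurement
```

Further down, `update_gpu_layers_and_vram` compares this estimate against the VRAM reported by `nvidia-smi` and decrements `gpu_layers` until the projection fits, keeping a fixed safety margin (577 MiB in the diff).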
+ """ + try: + # Execute nvidia-smi command + result = subprocess.run( + ['nvidia-smi'], + capture_output=True, + text=True, + check=False + ) + + # Check if nvidia-smi returned an error + if result.returncode != 0: + return -1 + + # Parse the output for memory usage patterns + output = result.stdout + + # Find memory usage like "XXXXMiB / YYYYMiB" + # Captures used and total memory for each GPU + matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output) + + if not matches: + # No GPUs found in expected format + return 0 + + total_vram_mib = 0 + total_free_vram_mib = 0 + + for used_mem_str, total_mem_str in matches: + try: + used_mib = int(used_mem_str) + total_mib = int(total_mem_str) + total_vram_mib += total_mib + total_free_vram_mib += (total_mib - used_mib) + except ValueError: + # Skip malformed entries + pass + + # Return either free or total VRAM based on the flag + return total_free_vram_mib if return_free else total_vram_mib + + except FileNotFoundError: + # nvidia-smi not found (likely no NVIDIA drivers installed) + return -1 + except Exception: + # Handle any other unexpected exceptions + return -1 + + +def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True): + """ + Unified function to handle GPU layers and VRAM updates. + + Args: + for_ui: If True, returns Gradio updates. If False, returns raw values. + + Returns: + - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update + - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage + """ + if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"): + vram_info = "
Estimated VRAM to load the model:
" + if for_ui: + return (vram_info, gr.update()) if auto_adjust else vram_info + else: + return (0, gpu_layers) if auto_adjust else 0 + + current_layers = gpu_layers + max_layers = gpu_layers + + if auto_adjust: + # Get model settings including user preferences + model_settings = get_model_metadata(model) + + # Get the true maximum layers + max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers)) + + # Check if this is a user-saved setting + user_config = shared.user_config + model_regex = Path(model).name + '$' + has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex] + + if has_user_setting: + # For user settings, just use the current value (which already has user pref) + # but ensure the slider maximum is correct + current_layers = gpu_layers # Already has user setting + else: + # No user setting, auto-adjust from the maximum + current_layers = max_layers # Start from max + + # Auto-adjust based on available/total VRAM + # If a model is loaded and it's for the UI, use the total VRAM to avoid confusion + return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True + available_vram = get_nvidia_vram(return_free=return_free) + if available_vram > 0: + tolerance = 577 + while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: + current_layers -= 1 + + # Calculate VRAM with current layers + vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type) + + if for_ui: + vram_info = f"
Estimated VRAM to load the model: {vram_usage:.0f} MiB
" + if auto_adjust: + return vram_info, gr.update(value=current_layers, maximum=max_layers) + else: + return vram_info + else: + if auto_adjust: + return vram_usage, current_layers + else: + return vram_usage diff --git a/modules/presets.py b/modules/presets.py index a432bf52..cf706605 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -11,7 +11,7 @@ from modules.logging_colors import logger def default_preset(): - return { + result = { 'temperature': 1, 'dynatemp_low': 1, 'dynatemp_high': 1, @@ -46,10 +46,17 @@ def default_preset(): 'do_sample': True, 'dynamic_temperature': False, 'temperature_last': False, - 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram', + 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntop_n_sigma\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram', 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', } + if shared.args.portable: + samplers = result['sampler_priority'].split('\n') + samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]] + result['sampler_priority'] = '\n'.join(samplers) + + return result + def presets_params(): return [k for k in default_preset()] diff --git a/modules/shared.py b/modules/shared.py index fb10c014..d2305f30 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -60,7 +60,6 @@ settings = { 'custom_stopping_strings': '', 'custom_token_bans': '', 'negative_prompt': '', - 'autoload_model': False, 'dark_theme': True, 'default_extensions': [], 'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}", @@ -88,7 +87,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, TensorRT-LLM.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. 
Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -121,7 +120,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') -group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') +group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.') group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.') group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') @@ -130,9 +129,9 @@ group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') # Cache -group = parser.add_argument_group('Context and cache management') +group = parser.add_argument_group('Context and cache') group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') -group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') +group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') # Speculative decoding group = parser.add_argument_group('Speculative decoding') @@ -153,18 +152,10 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.') group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') -# HQQ -group = parser.add_argument_group('HQQ') -group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. 
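The renamed flags above keep their old spellings as aliases, so existing launch scripts and `CMD_FLAGS.txt` entries keep working; argparse derives the attribute name from the first long option. A quick standalone check (not project code):

```python
import argparse

parser = argparse.ArgumentParser()
# The first long option ('--gpu-layers') determines the attribute name: args.gpu_layers
parser.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N')
parser.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N')

args = parser.parse_args(['--n-gpu-layers', '33', '--cache_type', 'q8_0'])
print(args.gpu_layers, args.cache_type)  # 33 q8_0
```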
Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') - # TensorRT-LLM group = parser.add_argument_group('TensorRT-LLM') group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') -# Cache -group = parser.add_argument_group('Cache') -group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') - # DeepSpeed group = parser.add_argument_group('DeepSpeed') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -190,6 +181,7 @@ group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certific group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None) group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy') group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.') +group.add_argument('--portable', action='store_true', help='Hide features not available in portable mode like training.') # API group = parser.add_argument_group('API') @@ -267,8 +259,6 @@ def fix_loader_name(name): return 'ExLlamav2_HF' elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']: return 'ExLlamav3_HF' - elif name in ['hqq']: - return 'HQQ' elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']: return 'TensorRT-LLM' @@ -311,11 +301,13 @@ if args.api or args.public_api: add_extension('openai', last=True) # Load model-specific settings -with Path(f'{args.model_dir}/config.yaml') as p: - if p.exists(): - model_config = yaml.safe_load(open(p, 'r').read()) - else: - model_config = {} +p = Path(f'{args.model_dir}/config.yaml') +if p.exists(): + model_config = yaml.safe_load(open(p, 'r').read()) +else: + model_config = {} +del p + # Load custom model-specific settings user_config = load_user_config() diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py index 73178c39..0527d493 100644 --- a/modules/tensorrt_llm.py +++ b/modules/tensorrt_llm.py @@ -1,15 +1,15 @@ from pathlib import Path -import torch - import tensorrt_llm +import torch +from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp + from modules import shared from modules.logging_colors import logger from modules.text_generation import ( get_max_prompt_length, get_reply_from_output_ids ) -from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp class TensorRTLLMModel: diff --git a/modules/text_generation.py b/modules/text_generation.py index 8d091868..1fd6d810 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -14,6 +14,7 @@ from modules.callbacks import Iteratorize from modules.extensions import apply_extensions from modules.html_generator import generate_basic_html from modules.logging_colors import logger +from modules.utils import check_model_loaded def generate_reply(*args, **kwargs): @@ -34,8 +35,8 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap # Find the appropriate generation function generate_func = apply_extensions('custom_generate_reply') if generate_func is None: - if shared.model_name == 'None' or shared.model is None: - 
logger.error("No model is loaded! Select one in the Model tab.") + model_is_loaded, error_message = check_model_loaded() + if not model_is_loaded: yield '' return @@ -471,7 +472,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None, t1 = time.time() original_tokens = len(original_input_ids[0]) new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0) - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') + logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') return @@ -480,7 +481,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N For models that do not use the transformers library for sampling """ - seed = set_manual_seed(state['seed']) + state['seed'] = set_manual_seed(state['seed']) t0 = time.time() reply = '' try: @@ -500,15 +501,15 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N t1 = time.time() original_tokens = len(encode(original_question)[0]) new_tokens = len(encode(original_question + reply)[0]) - original_tokens - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') + logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})') return -def print_prompt(prompt, max_chars=2000): +def print_prompt(prompt, max_chars=-1): DARK_YELLOW = "\033[38;5;3m" RESET = "\033[0m" - if len(prompt) > max_chars: + if max_chars > 0 and len(prompt) > max_chars: half_chars = max_chars // 2 hidden_len = len(prompt[half_chars:-half_chars]) hidden_msg = f"{DARK_YELLOW}[...{hidden_len} characters hidden...]{RESET}" diff --git a/modules/ui.py b/modules/ui.py index fb016f87..9f4d67cb 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -61,7 +61,7 @@ if not shared.args.old_colors: background_fill_primary_dark='var(--darker-gray)', body_background_fill="white", block_background_fill="transparent", - body_text_color="#333", + body_text_color='rgb(64, 64, 64)', button_secondary_background_fill="#f4f4f4", button_secondary_border_color="var(--border-color-primary)", @@ -71,6 +71,7 @@ if not shared.args.old_colors: block_background_fill_dark='transparent', block_border_color_dark='transparent', input_border_color_dark='var(--border-color-dark)', + input_border_color_focus_dark='var(--border-color-dark)', checkbox_border_color_dark='var(--border-color-dark)', border_color_primary_dark='var(--border-color-dark)', button_secondary_border_color_dark='var(--border-color-dark)', @@ -89,6 +90,8 @@ if not shared.args.old_colors: checkbox_label_shadow='none', block_shadow='none', block_shadow_dark='none', + input_shadow_focus='none', + input_shadow_focus_dark='none', button_large_radius='0.375rem', button_large_padding='6px 12px', input_radius='0.375rem', @@ -105,11 +108,10 @@ def list_model_elements(): 'filter_by_loader', 'loader', 'cpu_memory', - 'n_gpu_layers', + 'gpu_layers', 'threads', 'threads_batch', 'batch_size', - 'hqq_backend', 'ctx_size', 'cache_type', 'tensor_split', @@ -211,6 +213,15 @@ def list_interface_input_elements(): 'negative_prompt', 'dry_sequence_breakers', 'grammar_string', + 'navigate_message_index', + 'navigate_direction', + 'navigate_message_role', + 'edit_message_index', + 
'edit_message_text', + 'edit_message_role', + 'branch_index', + 'enable_web_search', + 'web_search_pages', ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 0d588549..d79aa523 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -24,7 +24,8 @@ def create_ui(): with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): with gr.Column(): with gr.Row(elem_id='past-chats-buttons'): - shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', interactive=not mu) + shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', elem_id='Branch', interactive=not mu) + shared.gradio['branch_index'] = gr.Number(value=-1, precision=0, visible=False, elem_id="Branch-index", interactive=True) shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) @@ -46,14 +47,14 @@ def create_ui(): with gr.Row(): with gr.Column(elem_id='chat-col'): - shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer + shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': [], 'metadata': {}}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) with gr.Row(elem_id="chat-input-row"): with gr.Column(scale=1, elem_id='gr-hover-container'): gr.HTML(value='
', elem_id='gr-hover') with gr.Column(scale=10, elem_id='chat-input-container'): - shared.gradio['textbox'] = gr.Textbox(label='', placeholder='Send a message', elem_id='chat-input', elem_classes=['add_scrollbar']) + shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar']) shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls') shared.gradio['typing-dots'] = gr.HTML(value='
', label='typing', elem_id='typing-container') @@ -70,8 +71,6 @@ def create_ui(): shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last') with gr.Row(): - shared.gradio['Replace last reply'] = gr.Button('Replace last reply (Ctrl + Shift + L)', elem_id='Replace-last') - shared.gradio['Copy last reply'] = gr.Button('Copy last reply (Ctrl + Shift + K)', elem_id='Copy-last') shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate') with gr.Row(): @@ -79,14 +78,20 @@ def create_ui(): shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply') with gr.Row(): - shared.gradio['send-chat-to-default'] = gr.Button('Send to default') - shared.gradio['send-chat-to-notebook'] = gr.Button('Send to notebook') + shared.gradio['send-chat-to-default'] = gr.Button('Send to Default') + shared.gradio['send-chat-to-notebook'] = gr.Button('Send to Notebook') with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']): with gr.Column(): with gr.Row(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) + with gr.Row(): + shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search') + + with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: + shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) + with gr.Row(): shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. 
In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') @@ -96,6 +101,22 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) + with gr.Row(): + shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm') + + shared.gradio['token_display'] = gr.HTML(value='', elem_classes='token-display') + + # Hidden elements for version navigation and editing + with gr.Row(visible=False): + shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") + shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction") + shared.gradio['navigate_message_role'] = gr.Textbox(value="", elem_id="Navigate-message-role") + shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version") + shared.gradio['edit_message_index'] = gr.Number(value=-1, precision=0, elem_id="Edit-message-index") + shared.gradio['edit_message_text'] = gr.Textbox(value="", elem_id="Edit-message-text") + shared.gradio['edit_message_role'] = gr.Textbox(value="", elem_id="Edit-message-role") + shared.gradio['edit_message'] = gr.Button(elem_id="Edit-message") + def create_chat_settings_ui(): mu = shared.args.multi_user @@ -185,7 +206,7 @@ def create_event_handlers(): shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( @@ -193,7 +214,7 @@ def create_event_handlers(): shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( @@ -221,10 +242,6 @@ def create_event_handlers(): None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') - shared.gradio['Replace last reply'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_replace_last_reply_click, 
gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) - shared.gradio['Send dummy message'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_send_dummy_message_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) @@ -258,7 +275,7 @@ def create_event_handlers(): shared.gradio['branch_chat'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) + chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'branch_index'), show_progress=False) shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False) shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False) @@ -290,7 +307,14 @@ def create_event_handlers(): None, gradio('mode'), None, js="(mode) => {mode === 'instruct' ? document.getElementById('character-menu').parentNode.parentNode.style.display = 'none' : document.getElementById('character-menu').parentNode.parentNode.style.display = ''}") shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False) - shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) + + shared.gradio['navigate_version'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_navigate_version_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False) + + shared.gradio['edit_message'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False) # Save/delete a character shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) @@ -347,3 +371,13 @@ def create_event_handlers(): None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') shared.gradio['show_controls'].change(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') + + shared.gradio['count_tokens'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.count_prompt_tokens, gradio('textbox', 'interface_state'), gradio('token_display'), show_progress=False) + + shared.gradio['enable_web_search'].change( + lambda x: gr.update(visible=x), + gradio('enable_web_search'), + gradio('web_search_row') + ) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index d13bcff7..862b3893 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -14,6 +14,7 @@ from modules.models_settings import ( get_model_metadata, save_instruction_template, save_model_settings, + update_gpu_layers_and_vram, update_model_parameters ) from modules.utils import gradio @@ -26,71 +27,34 @@ def create_ui(): with gr.Row(): with gr.Column(): with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, 
label='Model', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) - shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu) - shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) - shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) + shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) + shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu) + shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) + shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) - with gr.Column(): - with gr.Row(): - shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) - shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) - - with gr.Row(): - with gr.Column(): - shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None) + shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None) with gr.Blocks(): + gr.Markdown("## Main options") with gr.Row(): with gr.Column(): - shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') - shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) - shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) - shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) - shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.') - shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. 
For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') - shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') + shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags) - shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) - shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') - shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') - shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.') - shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.') - shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.') - shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') - + shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. 
q4_q8).') with gr.Column(): + shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) + shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') + shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') - shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') - shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') - shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') - shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) - shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') - shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. 
This saves VRAM but reduces the performance.') - shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) - shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) - shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') - shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') - shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') - shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) - shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) - shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) - shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) - shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') - shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') # Speculative decoding @@ -99,15 +63,50 @@ def create_ui(): shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. 
Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu) ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) - shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.') shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.') + shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.') shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.') - with gr.Column(): - with gr.Row(): - shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu) + gr.Markdown("## Other options") + with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'): + with gr.Row(): + with gr.Column(): + shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) + shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) + shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) + shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') + shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags) + shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) + shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') + shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') + shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). 
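The RoPE-related fields above are tied together by simple formulas: `rope_freq_base = 10000 * alpha_value ** (64 / 63)`, and `compress_pos_emb` is the target context divided by the model's native context (the inverse of `rope_freq_scale`). A small numeric illustration with assumed values:

```python
# Assumed example values, not recommendations.
alpha_value = 2.5                      # the UI hint suggests 2.5 for ~2x context
rope_freq_base = 10000 * alpha_value ** (64 / 63)
print(round(rope_freq_base))           # ~25366

original_ctx = 4096                    # hypothetical native context of the model
target_ctx = 8192
compress_pos_emb = target_ctx / original_ctx
print(compress_pos_emb)                # 2.0, i.e. 1 / rope_freq_scale
```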
Equal to 1/rope_freq_scale.') + shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.') + shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.') + shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') + with gr.Column(): + shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') + shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) + shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') + shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') + shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) + shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) + shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') + shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') + shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) + shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) + shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) + shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) + shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') + shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') + if not shared.args.portable: + with gr.Row(): + shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) + shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) + + with gr.Column(): with gr.Tab("Download"): shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. 
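The download field described above now also accepts direct Hugging Face GGUF URLs, handled in the `download_model_wrapper` hunk that follows. A sketch of the same string parsing, using a hypothetical URL:

```python
# Hypothetical URL; the repo path and filename are extracted the same way as in the diff.
url = "https://huggingface.co/TheOrg/SomeModel-GGUF/resolve/main/somemodel.Q4_K_M.gguf?download=true"

path = url.split("huggingface.co/")[1]
parts = path.split("/")
repo_id = f"{parts[0]}/{parts[1]}" if len(parts) >= 2 else path

filename = url.split("/")[-1].replace("?download=true", "")

print(repo_id)   # TheOrg/SomeModel-GGUF
print(filename)  # somemodel.Q4_K_M.gguf
```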
To download a single file, enter its name in the second box.", interactive=not mu) shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu) @@ -132,11 +131,10 @@ def create_event_handlers(): # In this event handler, the interface state is read and updated # with the model defaults (if any), and then the model is loaded - # unless "autoload_model" is unchecked shared.gradio['model_menu'].change( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then( - load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success( + partial(load_model_wrapper, autoload=False), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) shared.gradio['load_model'].click( @@ -145,15 +143,31 @@ def create_event_handlers(): partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) - shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False) + shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then( + partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False) + shared.gradio['save_model_settings'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) - shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) + # For ctx_size and cache_type - auto-adjust GPU layers + for param in ['ctx_size', 'cache_type']: + shared.gradio[param].change( + partial(update_gpu_layers_and_vram, auto_adjust=True), + gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), + gradio('vram_info', 'gpu_layers'), show_progress=False) + + # For manual gpu_layers changes - only update VRAM + shared.gradio['gpu_layers'].change( + partial(update_gpu_layers_and_vram, auto_adjust=False), + gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), + gradio('vram_info'), show_progress=False) + + if not shared.args.portable: + shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) + shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) - shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), 
gradio('load_model')) shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True) @@ -192,6 +206,26 @@ def load_lora_wrapper(selected_loras): def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False): try: + # Handle direct GGUF URLs + if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")): + try: + path = repo_id.split("huggingface.co/")[1] + + # Extract the repository ID (first two parts of the path) + parts = path.split("/") + if len(parts) >= 2: + extracted_repo_id = f"{parts[0]}/{parts[1]}" + + # Extract the filename (last part of the path) + filename = repo_id.split("/")[-1] + if "?download=true" in filename: + filename = filename.replace("?download=true", "") + + repo_id = extracted_repo_id + specific_file = filename + except: + pass + if repo_id == "": yield ("Please enter a model path") return @@ -205,6 +239,18 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur yield ("Getting the download links from Hugging Face") links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file) + + # Check for multiple GGUF files + gguf_files = [link for link in links if link.lower().endswith('.gguf')] + if len(gguf_files) > 1 and not specific_file: + output = "Multiple GGUF files found. Please copy one of the following filenames to the 'File name' field:\n\n```\n" + for link in gguf_files: + output += f"{Path(link).name}\n" + + output += "```" + yield output + return + if return_links: output = "```\n" for link in links: @@ -252,10 +298,34 @@ def update_truncation_length(current_length, state): return current_length +def get_initial_vram_info(): + if shared.model_name != 'None' and shared.args.loader == 'llama.cpp': + return update_gpu_layers_and_vram( + shared.args.loader, + shared.model_name, + shared.args.gpu_layers, + shared.args.ctx_size, + shared.args.cache_type, + auto_adjust=False, + for_ui=True + ) + + return "
Estimated VRAM to load the model:
" + + +def get_initial_gpu_layers_max(): + if shared.model_name != 'None' and shared.args.loader == 'llama.cpp': + model_settings = get_model_metadata(shared.model_name) + return model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', 256)) + + return 256 + + def handle_load_model_event_initial(model, state): state = apply_model_settings_to_state(model, state) output = ui.apply_interface_values(state) - update_model_parameters(state) + update_model_parameters(state) # This updates the command-line flags + return output + [state] diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 3f609d71..733d0901 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -21,7 +21,7 @@ def create_ui(default_preset): shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button') with gr.Column(): - shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') + shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown') with gr.Row(): with gr.Column(): @@ -82,7 +82,7 @@ def create_ui(default_preset): shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle mode.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle mode.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.') diff --git a/modules/ui_session.py b/modules/ui_session.py index 7cf9f6e6..a4eba667 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -23,11 +23,15 @@ def create_ui(): shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table') with gr.Column(): - extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. 
-            extension_status = gr.Markdown()
+            if not shared.args.portable:
+                extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
+                extension_status = gr.Markdown()
+            else:
+                pass
 
     shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light')
 
-    extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
+    if not shared.args.portable:
+        extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
 
     # Reset interface event
     shared.gradio['reset_interface'].click(
diff --git a/modules/utils.py b/modules/utils.py
index 77324139..577c55b8 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -72,6 +72,20 @@ def natural_keys(text):
     return [atoi(c) for c in re.split(r'(\d+)', text)]
 
 
+def check_model_loaded():
+    if shared.model_name == 'None' or shared.model is None:
+        if len(get_available_models()) == 0:
+            error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it"
+            logger.error(error_msg)
+            return False, error_msg
+        else:
+            error_msg = "No model is loaded. Please select one in the Model tab."
+            logger.error(error_msg)
+            return False, error_msg
+
+    return True, None
+
+
 def get_available_models():
     # Get all GGUF files
     gguf_files = get_available_ggufs()
@@ -123,7 +137,7 @@ def get_available_models():
     model_dirs = sorted(model_dirs, key=natural_keys)
 
-    return ['None'] + filtered_gguf_files + model_dirs
+    return filtered_gguf_files + model_dirs
 
 
 def get_available_ggufs():
diff --git a/modules/web_search.py b/modules/web_search.py
new file mode 100644
index 00000000..1f670349
--- /dev/null
+++ b/modules/web_search.py
@@ -0,0 +1,129 @@
+import concurrent.futures
+from concurrent.futures import as_completed
+from datetime import datetime
+
+import requests
+from bs4 import BeautifulSoup
+from duckduckgo_search import DDGS
+
+from modules.logging_colors import logger
+
+
+def get_current_timestamp():
+    """Returns the current time in 24-hour format"""
+    return datetime.now().strftime('%b %d, %Y %H:%M')
+
+
+def download_web_page(url, timeout=5):
+    """Download and extract text from a web page"""
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=timeout)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Remove script and style elements
+        for script in soup(["script", "style"]):
+            script.decompose()
+
+        # Get text and clean it up
+        text = soup.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+
+        return text
+    except Exception as e:
+        logger.error(f"Error downloading {url}: {e}")
+        return f"[Error downloading content from {url}: {str(e)}]"
+
+
+def perform_web_search(query, num_pages=3, max_workers=5):
+    """Perform web search and return results with content"""
try: + with DDGS() as ddgs: + results = list(ddgs.text(query, max_results=num_pages)) + + # Prepare download tasks + download_tasks = [] + for i, result in enumerate(results): + url = result.get('href', '') + title = result.get('title', f'Search Result {i+1}') + download_tasks.append((url, title, i)) + + search_results = [None] * len(download_tasks) # Pre-allocate to maintain order + + # Download pages in parallel + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all download tasks + future_to_task = { + executor.submit(download_web_page, task[0]): task + for task in download_tasks + } + + # Collect results as they complete + for future in as_completed(future_to_task): + url, title, index = future_to_task[future] + try: + content = future.result() + search_results[index] = { + 'title': title, + 'url': url, + 'content': content + } + except Exception as e: + logger.error(f"Error downloading {url}: {e}") + # Include failed downloads with empty content + search_results[index] = { + 'title': title, + 'url': url, + 'content': '' + } + + return search_results + + except Exception as e: + logger.error(f"Error performing web search: {e}") + return [] + + +def add_web_search_attachments(history, row_idx, user_message, search_query, state): + """Perform web search and add results as attachments""" + if not search_query: + logger.warning("No search query provided") + return + + try: + logger.info(f"Using search query: {search_query}") + + # Perform web search + num_pages = int(state.get('web_search_pages', 3)) + search_results = perform_web_search(search_query, num_pages) + + if not search_results: + logger.warning("No search results found") + return + + # Add search results as attachments + key = f"user_{row_idx}" + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "attachments" not in history['metadata'][key]: + history['metadata'][key]["attachments"] = [] + + for result in search_results: + attachment = { + "name": result['title'], + "type": "text/html", + "url": result['url'], + "content": result['content'] + } + history['metadata'][key]["attachments"].append(attachment) + + logger.info(f"Added {len(search_results)} web search results as attachments") + + except Exception as e: + logger.error(f"Error in web search: {e}") diff --git a/one_click.py b/one_click.py index 065afd99..482a6aa9 100644 --- a/one_click.py +++ b/one_click.py @@ -126,7 +126,7 @@ def check_env(): sys.exit(1) # Ensure this is a new environment and not the base environment - if os.environ["CONDA_DEFAULT_ENV"] == "base": + if os.environ.get("CONDA_DEFAULT_ENV", "") == "base": print("Create an environment for this project and activate it. 
Exiting...") sys.exit(1) @@ -222,7 +222,7 @@ def update_pytorch_and_python(): if "+cu" in torver: install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124" elif "+rocm" in torver: - install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1" + install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4" elif "+cpu" in torver: install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu" elif "+cxx11" in torver: @@ -273,7 +273,7 @@ def install_webui(): "What is your GPU?", { 'A': 'NVIDIA - CUDA 12.4', - 'B': 'AMD - Linux/macOS only, requires ROCm 6.1', + 'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4', 'C': 'Apple M Series', 'D': 'Intel Arc (beta)', 'N': 'CPU mode' @@ -314,7 +314,7 @@ def install_webui(): if selected_gpu == "NVIDIA": install_pytorch += "--index-url https://download.pytorch.org/whl/cu124" elif selected_gpu == "AMD": - install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1" + install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4" elif selected_gpu in ["APPLE", "NONE"]: install_pytorch += "--index-url https://download.pytorch.org/whl/cpu" elif selected_gpu == "INTEL": diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 6f265eba..2c322715 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,7 +1,9 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -13,6 +15,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -30,12 +33,12 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index c8e75ee7..6aeb325e 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,6 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index e54d6d9c..3b052423 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ 
-1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,6 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index d714ea3d..8c51459e 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl 
+https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 89f4f576..b9f15d45 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,8 +32,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 47ad5759..0877d968 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == 
"3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 334f11df..cab78237 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index e216c9cd..dfd42577 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -1,7 +1,9 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -13,6 +15,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -30,12 +33,12 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" 
and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 2e631bf0..5d9f84ce 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index c720daa7..fdae681d 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt deleted file mode 100644 index 7d9c00c0..00000000 --- a/requirements/portable/requirements_amd.txt +++ /dev/null @@ -1,18 +0,0 @@ -fastapi==0.112.4 -gradio==4.37.* -jinja2==3.1.6 -markdown -numpy==1.26.* -pydantic==2.8.2 -pyyaml -requests -rich -tqdm - -# API -flask_cloudflared==0.0.14 -sse-starlette==1.6.5 -tiktoken - -# AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt deleted file mode 100644 index d718c1b1..00000000 --- a/requirements/portable/requirements_amd_noavx2.txt +++ /dev/null @@ -1,18 +0,0 @@ -fastapi==0.112.4 -gradio==4.37.* -jinja2==3.1.6 -markdown -numpy==1.26.* -pydantic==2.8.2 -pyyaml -requests -rich -tqdm - -# API -flask_cloudflared==0.0.14 -sse-starlette==1.6.5 -tiktoken - -# AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 9e184b53..a58f39f7 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index ec059716..91ea3a6d 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; 
platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index d473b824..37e5aa40 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index d3fffb43..dcb2884b 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index cdfa6a01..8f1295bb 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index 6f9566ba..21805fe2 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 1a7ce6ed..858b4488 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 4737321d..569bae99 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/server.py b/server.py index 169578a5..c22ed1f1 100644 --- a/server.py +++ b/server.py @@ -51,6 +51,7 @@ from modules.models import load_model, unload_model_if_idle from modules.models_settings import ( get_fallback_settings, get_model_metadata, + update_gpu_layers_and_vram, update_model_parameters ) from modules.shared import do_cmd_flags_warnings @@ -90,7 +91,7 @@ def create_interface(): 'instruction_template_str': shared.settings['instruction_template_str'], 'prompt_menu-default': shared.settings['prompt-default'], 'prompt_menu-notebook': shared.settings['prompt-notebook'], - 'filter_by_loader': shared.args.loader or 'All' + 'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp' }) if Path("user_data/cache/pfp_character.png").exists(): @@ -127,7 +128,8 @@ def create_interface(): ui_parameters.create_ui(shared.settings['preset']) # Parameters tab ui_model_menu.create_ui() # Model tab - training.create_ui() # Training tab + if not shared.args.portable: + training.create_ui() # Training tab ui_session.create_ui() # Session tab # Generation events @@ -247,6 +249,20 @@ if __name__ == "__main__": model_settings = get_model_metadata(model_name) update_model_parameters(model_settings, initial=True) # hijack the command-line arguments + # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model + if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings: + vram_usage, adjusted_layers = update_gpu_layers_and_vram( + shared.args.loader, + model_name, + model_settings['gpu_layers'], + shared.args.ctx_size, + shared.args.cache_type, + auto_adjust=True, + for_ui=False + ) + + shared.args.gpu_layers = adjusted_layers + # Load the model shared.model, shared.tokenizer = load_model(model_name) if shared.args.lora: diff --git a/start_linux.sh b/start_linux.sh index 00082f07..e2b00558 100755 --- a/start_linux.sh +++ b/start_linux.sh @@ -1,10 +1,15 @@ #!/usr/bin/env bash +# environment isolation +export PYTHONNOUSERSITE=1 +unset PYTHONPATH +unset PYTHONHOME + cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case if [ -d "portable_env" ]; then - ./portable_env/bin/python3 server.py --api --auto-launch "$@" + ./portable_env/bin/python3 server.py --portable --api --auto-launch "$@" exit $? fi @@ -61,10 +66,6 @@ if [ ! 
-e "$INSTALL_ENV_DIR/bin/python" ]; then exit fi -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME export CUDA_PATH="$INSTALL_ENV_DIR" export CUDA_HOME="$CUDA_PATH" diff --git a/start_macos.sh b/start_macos.sh index 628f59cc..bff11bc1 100755 --- a/start_macos.sh +++ b/start_macos.sh @@ -1,10 +1,15 @@ #!/bin/bash +# environment isolation +export PYTHONNOUSERSITE=1 +unset PYTHONPATH +unset PYTHONHOME + cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case if [ -d "portable_env" ]; then - ./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@" + ./portable_env/bin/python3 server.py --portable --api --auto-launch --api-port 5005 "$@" exit $? fi @@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then exit fi -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME export CUDA_PATH="$INSTALL_ENV_DIR" export CUDA_HOME="$CUDA_PATH" diff --git a/start_windows.bat b/start_windows.bat index 451b85e0..f5e66ec2 100755 --- a/start_windows.bat +++ b/start_windows.bat @@ -1,11 +1,16 @@ @echo off setlocal enabledelayedexpansion +@rem environment isolation +set PYTHONNOUSERSITE=1 +set PYTHONPATH= +set PYTHONHOME= + cd /D "%~dp0" @rem Portable install case if exist "portable_env" ( - .\portable_env\python.exe server.py --api --auto-launch %* + .\portable_env\python.exe server.py --portable --api --auto-launch %* exit /b %errorlevel% ) @@ -87,10 +92,6 @@ if not exist "%INSTALL_ENV_DIR%" ( @rem check if conda environment was actually created if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end ) -@rem environment isolation -set PYTHONNOUSERSITE=1 -set PYTHONPATH= -set PYTHONHOME= set "CUDA_PATH=%INSTALL_ENV_DIR%" set "CUDA_HOME=%CUDA_PATH%" diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml index 20896da3..ce0f77e1 100644 --- a/user_data/settings-template.yaml +++ b/user_data/settings-template.yaml @@ -31,7 +31,6 @@ seed: -1 custom_stopping_strings: '' custom_token_bans: '' negative_prompt: '' -autoload_model: false dark_theme: true default_extensions: [] instruction_template_str: |-
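As a quick way to sanity-check the new modules/web_search.py module outside the UI, here is a minimal sketch that calls its two entry points directly. It assumes the dependencies added above (duckduckgo_search, beautifulsoup4, requests) are installed and that it is run from the repository root so the modules package is importable; the search query is only an example, and the history/state dictionaries are simplified stand-ins for the structures the chat code actually passes in, not the exact objects used by the UI.

```python
# Minimal sketch: exercise modules/web_search.py in isolation (assumptions noted above).
from modules.web_search import add_web_search_attachments, perform_web_search

# perform_web_search() returns a list of {'title', 'url', 'content'} dicts,
# preserved in result order even though pages are downloaded in parallel.
results = perform_web_search("llama.cpp gpu offloading", num_pages=2)
for result in results:
    print(result['title'], result['url'], len(result['content']))

# add_web_search_attachments() stores the same results as attachments under
# history['metadata']['user_<row_idx>'] (the key is only created if results were found).
history = {'metadata': {}}          # illustrative stand-in for the chat history dict
state = {'web_search_pages': 2}     # illustrative stand-in for the UI state dict
add_web_search_attachments(history, 0, "user message", "llama.cpp gpu offloading", state)
for attachment in history['metadata'].get('user_0', {}).get('attachments', []):
    print(attachment['name'], attachment['url'])
```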