Merge branch 'main' into patch-2

Commit dddb887b60 by Mykeehu, 2025-06-04 08:46:55 +02:00, committed by GitHub
Signature: no known key found for this signature in database (GPG key ID: B5690EEEBB952194)
72 changed files with 2334 additions and 642 deletions

View file

@ -102,6 +102,8 @@ jobs:
shell: bash
run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables
CUDA_VERSION="${{ matrix.cuda }}"

View file

@ -101,6 +101,8 @@ jobs:
shell: bash
run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"

View file

@ -101,6 +101,8 @@ jobs:
shell: bash
run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"

View file

@ -12,18 +12,20 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## Features
- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2).
- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is also supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile).
- Additional quantization libraries like [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) can be used with the Transformers loader if you install them manually.
- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for llama.cpp GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment.
- UI that resembles the original ChatGPT style.
- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats.
- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`.
- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these.
- Multiple sampling parameters and generation options for sophisticated text generation control.
- Switch between different models easily in the UI without restarting, with fine control over settings.
- OpenAI-compatible API with Chat and Completions endpoints; see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)).
- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
- 100% offline and private, with zero telemetry, external resources, or remote update requests.
- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats.
- **File attachments**: Upload text files and PDF documents to talk about their contents.
- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
- Aesthetic UI with dark and light themes.
- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters.
- Edit messages, navigate between message versions, and branch conversations at any point.
- Multiple sampling parameters and generation options for sophisticated text generation control.
- Switch between different models in the UI without restarting.
- Automatic GPU layers for GGUF models (on NVIDIA GPUs).
- Free-form text generation in the Default/Notebook tabs without being limited to chat turns.
- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support; see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
- Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
## How to install
@ -44,7 +46,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases
To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again.
You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, or `update_wizard_macos.sh`.
<details>
<summary>
@ -55,12 +57,12 @@ Setup details and information about installing manually
The script uses Miniconda to set up a Conda environment in the `installer_files` folder.
If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, `cmd_macos.sh`, or `cmd_wsl.bat`.
If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`.
* There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root.
* To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts.
* For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki).
* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.
* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.
### Manual installation using Conda
@ -90,7 +92,7 @@ conda activate textgen
|--------|---------|---------|
| Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
| Linux/WSL | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu` |
| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.1` |
| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` |
| MacOS + MPS | Any | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` |
| Windows | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
| Windows | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` |
@ -146,14 +148,14 @@ The `requirements*.txt` above contain various wheels precompiled through GitHub
For NVIDIA GPU:
ln -s docker/{nvidia/Dockerfile,nvidia/docker-compose.yml,.dockerignore} .
For AMD GPU:
ln -s docker/{amd/Dockerfile,intel/docker-compose.yml,.dockerignore} .
ln -s docker/{amd/Dockerfile,amd/docker-compose.yml,.dockerignore} .
For Intel GPU:
ln -s docker/{intel/Dockerfile,amd/docker-compose.yml,.dockerignore} .
For CPU only
ln -s docker/{cpu/Dockerfile,cpu/docker-compose.yml,.dockerignore} .
cp docker/.env.example .env
#Create logs/cache dir :
mkdir -p logs cache
mkdir -p user_data/logs user_data/cache
# Edit .env and set:
# TORCH_CUDA_ARCH_LIST based on your GPU model
# APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal)
@ -187,13 +189,13 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--
[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
[--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit]
[--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap]
[--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N]
[--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N]
[--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT]
[--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner]
[--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE]
[--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH]
[--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api]
[--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
[--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR]
[--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT]
[--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
[--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY]
[--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
Text generation web UI
@ -215,7 +217,7 @@ Basic settings:
--idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.
Model loader:
--loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ,
--loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2,
TensorRT-LLM.
Transformers/Accelerate:
@ -246,16 +248,18 @@ llama.cpp:
--batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval.
--no-mmap Prevent mmap from being used.
--mlock Force the system to keep the model in RAM.
--n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU.
--gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU.
--tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.
--numa Activate NUMA task allocation for llama.cpp.
--no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
--row-split Split the model by rows across GPUs. This may improve multi-gpu performance.
--extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"
--extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"
--streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
Context and cache management:
Context and cache:
--ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens.
--cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits
separately, e.g. q4_q8).
Speculative decoding:
--model-draft MODEL_DRAFT Path to the draft model for speculative decoding.
@ -274,15 +278,9 @@ ExLlamaV2:
--num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral.
--enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2.
HQQ:
--hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.
TensorRT-LLM:
--cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
Cache:
--cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.
DeepSpeed:
--deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.
--nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading.
@ -305,6 +303,7 @@ Gradio:
--ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file.
--subpath SUBPATH Customize the subpath for gradio, use with reverse proxy
--old-colors Use the legacy Gradio colors, before the December/2024 update.
--portable Hide features not available in portable mode like training.
API:
--api Enable the API extension.

View file

@ -1,7 +1,9 @@
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 28px;
padding-bottom: 22px;
padding-top: 6px;
font-size: 18px;
font-family: Roboto, Arial, sans-serif; /* Modern font */
line-height: 1.5;
@ -102,6 +104,7 @@
@media screen and (width <= 688px) {
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 25px;
font-size: 15px;

View file

@ -2,8 +2,10 @@
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 28px;
padding-bottom: 21px;
padding-top: 7px;
font-size: 18px;
font-family: 'Noto Sans', Arial, sans-serif;
line-height: 1.428571429;
@ -100,6 +102,7 @@
@media screen and (width <= 688px) {
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 25px;
font-size: 15px;

View file

@ -16,6 +16,7 @@
}
.message {
padding-bottom: 2em;
padding-bottom: 1.5em;
padding-top: 0.5em;
grid-template-columns: 70px minmax(0, 1fr);
}

View file

@ -1,7 +1,9 @@
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 2em;
padding-bottom: 1.5em;
padding-top: 0.5em;
font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 22.5px !important;

View file

@ -1,5 +1,6 @@
.message {
padding-bottom: 25px;
padding-bottom: 22px;
padding-top: 3px;
font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 1.428571429;

View file

@ -1,5 +1,6 @@
.message {
padding-bottom: 25px;
padding-bottom: 22px;
padding-top: 3px;
font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 1.428571429;

View file

@ -8,10 +8,6 @@
padding-top: 0 !important;
}
.chat > .messages > :last-child {
margin-bottom: 1.7rem !important;
}
.chat .message-body p, .chat .message-body li {
font-size: 1rem !important;
line-height: 28px !important;
@ -46,7 +42,7 @@
}
.chat .user-message {
background: #f5f5f5;
background: #f3f4f6;
padding: 1.5rem 1rem;
padding-bottom: 2rem;
border-radius: 0;
@ -61,16 +57,16 @@
}
.dark .chat .user-message {
background: transparent;
background: var(--light-gray);
}
.dark .chat .assistant-message {
background: var(--light-gray);
background: transparent;
}
.chat .user-message .text,
.chat .assistant-message .text {
max-width: 645px;
max-width: 700px;
margin-left: auto;
margin-right: auto;
}

View file

@ -1,11 +1,11 @@
:root {
--darker-gray: #202123;
--dark-gray: #343541;
--light-gray: #444654;
--light-theme-gray: #f5f5f5;
--dark-gray: #2A2B32;
--light-gray: #373943;
--light-theme-gray: #f9fbff;
--border-color-dark: #525252;
--header-width: 112px;
--selected-item-color-dark: #32333e;
--selected-item-color-dark: #2E2F38;
}
@font-face {
@ -131,7 +131,7 @@ gradio-app > :first-child {
}
.header_bar {
box-shadow: 0 0 3px rgba(22 22 22 / 35%);
border-right: var(--input-border-width) solid var(--input-border-color);
margin-bottom: 0;
overflow-x: scroll;
text-wrap: nowrap;
@ -265,7 +265,7 @@ button {
.dark .pretty_scrollbar::-webkit-scrollbar-thumb,
.dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover {
background: #ccc;
background: rgb(255 255 255 / 10%);
border-radius: 10px;
}
@ -389,8 +389,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.chat {
margin-left: auto;
margin-right: auto;
min-height: var(--chat-height);
overflow-y: auto;
flex: 1;
overflow-y: hidden;
display: flex;
flex-direction: column;
word-break: break-word;
@ -401,10 +401,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.chat-parent {
height: calc(100dvh - 98px - var(--input-delta));
flex: 1;
overflow: auto !important;
border-radius: 0 !important;
margin-bottom: var(--input-delta) !important;
}
.chat-parent .prose {
@ -420,14 +419,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
padding-right: 1rem;
}
.chat .message .timestamp {
font-size: 0.7em;
display: inline-block;
font-weight: normal;
opacity: 0.7;
margin-left: 5px;
}
.chat-parent.bigchat {
height: calc(100dvh - 98px - var(--input-delta)) !important;
margin-bottom: var(--input-delta) !important;
flex: 1;
}
.chat > .messages {
display: flex;
flex-direction: column;
min-height: calc(100vh - 102px);
}
.chat > .messages > :first-child {
@ -546,7 +553,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
border-radius: 5px;
font-size: 82%;
padding: 1px 3px;
background: white !important;
background: #f3f4f6 !important;
color: #1f2328;
}
@ -560,18 +567,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
padding: 15px;
}
.message-body :not(pre) > code::before {
content: "`";
}
.message-body :not(pre) > code::after {
content: "`";
}
.message-body :not(pre) > code {
white-space: normal !important;
font-weight: bold;
font-family: unset;
font-size: 0.95em;
font-family: Menlo,"Roboto Mono","Courier New",Courier,monospace,Inter,sans-serif;
padding: .15rem .3rem;
background-color: #ececec;
}
.dark .message-body :not(pre) > code {
background-color: rgb(255 255 255 / 10%);
}
#chat-input {
@ -582,7 +588,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-input textarea {
background: #f3f4f6;
padding: 0.65rem 2.5rem;
border: 0;
box-shadow: 0;
border-radius: 8px;
}
#chat-input textarea::placeholder {
@ -602,9 +612,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
display: none;
}
#chat-input .submit-button {
display: none;
}
#chat-input .upload-button {
margin-right: 16px;
margin-bottom: 7px;
background: transparent;
}
.chat-input-positioned {
position: absolute;
bottom: 0;
max-width: 54rem;
left: 50%;
transform: translateX(-50%);
@ -744,7 +762,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.hover-menu button {
width: 100%;
background: transparent !important;
background: white !important;
border-radius: 0 !important;
justify-content: space-between;
margin: 0 !important;
@ -760,7 +778,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.hover-menu button:hover {
background: var(--button-secondary-background-fill-hover) !important;
background: #dbeafe !important;
}
.dark .hover-menu button:hover {
background: var(--selected-item-color-dark) !important;
}
.transparent-substring {
@ -789,6 +811,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-input-container {
display: flex;
flex-direction: column;
min-width: 0 !important;
}
@ -798,9 +822,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-input-row {
padding-bottom: 1.5em;
padding-left: 1rem;
padding-right: 1rem;
padding: 1rem;
padding-top: 0;
}
#chat-input-row.bigchat {
@ -808,27 +831,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-col {
padding-bottom: 100px;
height: 100dvh;
display: flex;
flex-direction: column;
padding-bottom: 0;
gap: 0;
}
@media screen and (width <= 924px) {
#chat-col {
padding-bottom: 100px;
margin-top: 32px;
position: relative; /* Ensure positioning for the pseudo-element */
}
.chat-parent {
height: calc(100dvh - 98px - var(--input-delta) - 32px);
}
.chat-parent.bigchat {
height: calc(100dvh - 98px - var(--input-delta) - 32px) !important;
height: calc(100dvh - 32px);
}
}
#chat-col.bigchat {
padding-bottom: 80px !important;
padding-bottom: 15px !important;
}
.message-body ol, .message-body ul {
@ -985,6 +1003,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
cursor: pointer;
}
#past-chats .selected,
#past-chats label:hover {
background-color: #dbeafe !important;
}
#past-chats-buttons,
#delete-chat-row,
#rename-row {
@ -993,7 +1016,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
gap: 9px;
}
#past-chats-row,
#chat-controls {
width: 260px;
@ -1111,12 +1133,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
color: #9ca3af;
}
.dark .hover-menu {
background-color: var(--darker-gray);
}
.dark .hover-menu button {
border-color: var(--border-color-primary);
background-color: var(--darker-gray) !important;
}
.dark #chat-controls,
@ -1125,8 +1144,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
border: 0 !important;
}
.dark #past-chats .selected,
.dark #past-chats label:hover {
.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats .selected,
.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats label:hover {
background-color: var(--selected-item-color-dark) !important;
}
@ -1163,7 +1182,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.header_bar button.selected {
background: #E0E0E0;
background: #dbeafe;
}
#chat-controls,
@ -1171,11 +1190,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
background-color: var(--light-theme-gray);
}
#chat-controls {
.dark #chat-controls {
border-left: 1px solid #d9d9d0;
}
#past-chats-row {
.dark #past-chats-row {
border-right: 1px solid #d9d9d0;
}
@ -1236,42 +1255,31 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
position: relative;
}
.footer-button {
/* New container for the buttons */
.message-actions {
position: absolute;
bottom: -23px;
left: 0;
display: flex;
gap: 5px;
opacity: 0;
transition: opacity 0.2s;
}
.footer-button {
padding: 0;
margin: 0;
border: none;
border-radius: 3px;
cursor: pointer;
opacity: 0;
display: flex;
align-items: center;
transition: opacity 0.2s;
justify-content: center;
}
.footer-button.footer-copy-button {
bottom: -23px;
left: 0;
}
.footer-button.footer-refresh-button {
bottom: -23px;
left: 25px;
}
.footer-button.footer-continue-button {
bottom: -23px;
left: 50px;
}
.footer-button.footer-remove-button {
bottom: -23px;
left: 75px;
}
.message:hover .footer-button,
.user-message:hover .footer-button,
.assistant-message:hover .footer-button {
.message:hover .message-actions,
.user-message:hover .message-actions,
.assistant-message:hover .message-actions {
opacity: 1;
}
@ -1362,6 +1370,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
contain: layout;
}
.chat .message-body .thinking-content p,
.chat .message-body .thinking-content li {
font-size: 15px !important;
}
/* Animation for opening thinking blocks */
@keyframes fadeIn {
from { opacity: 0; }
@ -1382,3 +1395,163 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
50% { opacity: 1; }
100% { opacity: 0.6; }
}
strong {
font-weight: bold;
}
.min.svelte-1ybaih5 {
min-height: 0;
}
#vram-info .value {
color: #008d00;
}
.dark #vram-info .value {
color: #07ff07;
}
.message-attachments {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 8px;
padding-bottom: 6px;
}
.attachment-box {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
padding: 8px;
background: rgb(0 0 0 / 5%);
border-radius: 6px;
border: 1px solid rgb(0 0 0 / 10%);
min-width: 80px;
max-width: 120px;
}
.attachment-icon {
margin-bottom: 4px;
color: #555;
}
.attachment-name {
font-size: 0.8em;
text-align: center;
word-break: break-word;
overflow: hidden;
text-overflow: ellipsis;
display: -webkit-box;
-webkit-line-clamp: 2;
-webkit-box-orient: vertical;
}
.dark .attachment-box {
background: rgb(255 255 255 / 5%);
border: 1px solid rgb(255 255 255 / 10%);
}
.dark .attachment-icon {
color: #ccc;
}
/* Message Editing Styles */
.editing-textarea {
width: 100%;
min-height: 200px;
max-height: 65vh;
padding: 10px;
border-radius: 5px;
border: 1px solid #ccc;
background-color: var(--light-theme-gray);
font-family: inherit;
font-size: inherit;
resize: vertical;
}
.dark .editing-textarea {
border: 1px solid var(--border-color-dark);
background-color: var(--darker-gray);
}
.editing-textarea:focus {
outline: none;
border-color: var(--selected-item-color-dark);
}
.edit-controls-container {
margin-top: 0;
display: flex;
gap: 8px;
padding-bottom: 8px;
}
.edit-control-button {
padding: 6px 12px;
border: 1px solid #ccc;
border-radius: 4px;
cursor: pointer;
background-color: #f8f9fa;
color: #212529;
font-size: 12px;
margin: 0;
}
.dark .edit-control-button {
border: 1px solid var(--border-color-dark);
background-color: var(--light-gray);
color: #efefef;
}
/* --- Simple Version Navigation --- */
.version-navigation {
position: absolute;
bottom: -23px;
right: 0;
display: flex;
align-items: center;
gap: 5px;
opacity: 0;
transition: opacity 0.2s;
}
.message:hover .version-navigation,
.user-message:hover .version-navigation,
.assistant-message:hover .version-navigation {
opacity: 1;
}
.version-nav-button {
padding: 2px 6px;
font-size: 12px;
min-width: auto;
}
.version-nav-button[disabled] {
opacity: 0.3;
cursor: not-allowed;
}
.version-position {
font-size: 11px;
color: currentcolor;
font-family: monospace;
min-width: 35px;
text-align: center;
opacity: 0.8;
user-select: none;
}
.token-display {
font-family: monospace;
font-size: 13px;
color: var(--body-text-color-subdued);
margin-top: 4px;
}
button:focus {
outline: none;
}

View file

@ -14,7 +14,7 @@ WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=B LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
WORKDIR /home/app/text-generation-webui
# set umask to ensure group read / write at runtime

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:
@ -41,14 +41,4 @@ services:
security_opt:
- seccomp=unconfined
volumes:
- ./cache:/home/app/text-generation-webui/cache
- ./characters:/home/app/text-generation-webui/characters
- ./extensions:/home/app/text-generation-webui/extensions
- ./loras:/home/app/text-generation-webui/loras
- ./logs:/home/app/text-generation-webui/logs
- ./models:/home/app/text-generation-webui/models
- ./presets:/home/app/text-generation-webui/presets
- ./prompts:/home/app/text-generation-webui/prompts
- ./softprompts:/home/app/text-generation-webui/softprompts
- ./training:/home/app/text-generation-webui/training
- ./cloudflared:/etc/cloudflared
- ./user_data:/home/app/text-generation-webui/user_data

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:

View file

@ -14,7 +14,7 @@ WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=D LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
# set umask to ensure group read / write at runtime
WORKDIR /home/app/text-generation-webui

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:
@ -41,12 +41,4 @@ services:
security_opt:
- seccomp=unconfined
volumes:
- ./characters:/home/app/text-generation-webui/characters
- ./extensions:/home/app/text-generation-webui/extensions
- ./loras:/home/app/text-generation-webui/loras
- ./models:/home/app/text-generation-webui/models
- ./presets:/home/app/text-generation-webui/presets
- ./prompts:/home/app/text-generation-webui/prompts
- ./softprompts:/home/app/text-generation-webui/softprompts
- ./training:/home/app/text-generation-webui/training
- ./cloudflared:/etc/cloudflared
- ./user_data:/home/app/text-generation-webui/user_data

View file

@ -14,7 +14,7 @@ WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
WORKDIR /home/app/text-generation-webui
# set umask to ensure group read / write at runtime

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:
@ -31,17 +31,7 @@ services:
stdin_open: true
tty: true
volumes:
- ./cache:/home/app/text-generation-webui/cache
- ./characters:/home/app/text-generation-webui/characters
- ./extensions:/home/app/text-generation-webui/extensions
- ./loras:/home/app/text-generation-webui/loras
- ./logs:/home/app/text-generation-webui/logs
- ./models:/home/app/text-generation-webui/models
- ./presets:/home/app/text-generation-webui/presets
- ./prompts:/home/app/text-generation-webui/prompts
- ./softprompts:/home/app/text-generation-webui/softprompts
- ./training:/home/app/text-generation-webui/training
- ./cloudflared:/etc/cloudflared
- ./user_data:/home/app/text-generation-webui/user_data
deploy:
resources:
reservations:

View file

@ -257,6 +257,85 @@ headers = {
in any of the examples above.
#### Tool/Function Calling Example
You need to use a model with tools support. The prompt will be automatically formatted using the model's Jinja2 template.
Request:
```
curl http://127.0.0.1:5000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {
        "role": "system",
        "content": "You are a helpful assistant."
      },
      {
        "role": "user",
        "content": "What time is it currently in New York City?"
      }
    ],
    "tools": [
      {
        "type": "function",
        "function": {
          "name": "get_current_time",
          "description": "Get the current time in a specific timezone",
          "parameters": {
            "type": "object",
            "required": ["timezone"],
            "properties": {
              "timezone": {
                "type": "string",
                "description": "IANA timezone name (e.g., America/New_York, Europe/London). Use Europe/Berlin as local timezone if no timezone provided by the user."
              }
            }
          }
        }
      }
    ]
  }'
```
Sample response:
```
{
  "id": "chatcmpl-1746532051477984256",
  "object": "chat.completion",
  "created": 1746532051,
  "model": "qwen2.5-coder-14b-instruct-q4_k_m.gguf",
  "choices": [
    {
      "index": 0,
      "finish_reason": "tool_calls",
      "message": {
        "role": "assistant",
        "content": "```xml\n<function>\n{\n \"name\": \"get_current_time\",\n \"arguments\": {\n \"timezone\": \"America/New_York\"\n }\n}\n</function>\n```"
      },
      "tool_calls": [
        {
          "type": "function",
          "function": {
            "name": "get_current_time",
            "arguments": "{\"timezone\": \"America/New_York\"}"
          },
          "id": "call_52ij07mh",
          "index": "0"
        }
      ]
    }
  ],
  "usage": {
    "prompt_tokens": 224,
    "completion_tokens": 38,
    "total_tokens": 262
  }
}
```
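The same request can be sent from Python. Below is a minimal sketch using the `requests` library; the URL, port, and tool schema mirror the curl example above, and the response handling follows the sample response shown, where `tool_calls` sits next to `message` inside each choice:
```
# Minimal sketch of the tool-calling request above, using the requests library.
import json

import requests

url = "http://127.0.0.1:5000/v1/chat/completions"  # assumes the API extension is enabled (--api)
payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What time is it currently in New York City?"},
    ],
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "get_current_time",
                "description": "Get the current time in a specific timezone",
                "parameters": {
                    "type": "object",
                    "required": ["timezone"],
                    "properties": {
                        "timezone": {"type": "string", "description": "IANA timezone name"}
                    },
                },
            },
        }
    ],
}

response = requests.post(url, headers={"Content-Type": "application/json"}, json=payload)
choice = response.json()["choices"][0]
if choice["finish_reason"] == "tool_calls":
    for call in choice["tool_calls"]:
        # arguments arrive as a JSON string, e.g. '{"timezone": "America/New_York"}'
        args = json.loads(call["function"]["arguments"])
        print(call["function"]["name"], args)
```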
### Environment variables
The following environment variables can be used (they take precedence over everything else):

View file

@ -1,16 +1,14 @@
import base64
import copy
import re
import json
import time
from collections import deque
from io import BytesIO
import requests
import tiktoken
from PIL import Image
from pydantic import ValidationError
from extensions.openai.errors import InvalidRequestError
from extensions.openai.utils import debug_msg
from extensions.openai.typing import ToolDefinition
from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall
from modules import shared
from modules.chat import (
generate_chat_prompt,
@ -96,72 +94,32 @@ def convert_history(history):
user_input_last = True
system_message = ""
# Multimodal: convert OpenAI format to multimodal extension format
if any('content' in entry and isinstance(entry['content'], list) for entry in history):
new_history = []
for entry in history:
if isinstance(entry['content'], list):
for item in entry['content']:
if not isinstance(item, dict):
continue
image_url = None
content = None
if item['type'] == 'image_url' and isinstance(item['image_url'], dict):
image_url = item['image_url']['url']
elif item['type'] == 'text' and isinstance(item['text'], str):
content = item['text']
if image_url:
new_history.append({"image_url": image_url, "role": "user"})
if content:
new_history.append({"content": content, "role": "user"})
else:
new_history.append(entry)
history = new_history
for entry in history:
if "image_url" in entry:
image_url = entry['image_url']
if "base64" in image_url:
image_url = re.sub('^data:image/.+;base64,', '', image_url)
img = Image.open(BytesIO(base64.b64decode(image_url)))
else:
try:
my_res = requests.get(image_url)
img = Image.open(BytesIO(my_res.content))
except Exception:
raise 'Image cannot be loaded from the URL!'
buffered = BytesIO()
if img.mode in ("RGBA", "P"):
img = img.convert("RGB")
img.save(buffered, format="JPEG")
img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
content = f'<img src="data:image/jpeg;base64,{img_str}">'
else:
content = entry["content"]
role = entry["role"]
if role == "user":
user_input = content
user_input_last = True
if current_message:
chat_dialogue.append([current_message, ''])
chat_dialogue.append([current_message, '', ''])
current_message = ""
current_message = content
elif role == "assistant":
if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "":
continue # skip tool calls
current_reply = content
user_input_last = False
if current_message:
chat_dialogue.append([current_message, current_reply])
chat_dialogue.append([current_message, current_reply, ''])
current_message = ""
current_reply = ""
else:
chat_dialogue.append(['', current_reply])
chat_dialogue.append(['', current_reply, ''])
elif role == "tool":
user_input_last = False
chat_dialogue.append(['', '', content])
elif role == "system":
system_message += f"\n{content}" if system_message else content
@ -181,6 +139,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
if 'messages' not in body:
raise InvalidRequestError(message="messages is required", param='messages')
tools = None
if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0:
tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails
messages = body['messages']
for m in messages:
if 'role' not in m:
@ -238,6 +200,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
'custom_system_message': custom_system_message,
'chat_template_str': chat_template_str,
'chat-instruct_command': chat_instruct_command,
'tools': tools,
'history': history,
'stream': stream
})
@ -250,7 +213,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
requested_model = generate_params.pop('model')
logprob_proc = generate_params.pop('logprob_proc', None)
def chat_streaming_chunk(content):
def chat_streaming_chunk(content, chunk_tool_calls=None):
# begin streaming
chunk = {
"id": cmpl_id,
@ -260,7 +223,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
resp_list: [{
"index": 0,
"finish_reason": None,
"delta": {'role': 'assistant', 'content': content},
"delta": {'role': 'assistant', 'content': content, 'tool_calls': chunk_tool_calls},
}],
}
@ -269,6 +232,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
# else:
# chunk[resp_list][0]["logprobs"] = None
return chunk
# generate reply #######################################
@ -277,8 +241,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
yield {'prompt': prompt}
return
debug_msg({'prompt': prompt, 'generate_params': generate_params})
if stream:
yield chat_streaming_chunk('')
@ -288,8 +250,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
answer = ''
seen_content = ''
tool_calls = []
end_last_tool_call = 0
supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None
for a in generator:
answer = a['internal'][-1][1]
if supported_tools is not None:
tool_call = parseToolCall(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else []
if len(tool_call) > 0:
for tc in tool_call:
tc["id"] = getToolCallId()
tc["index"] = str(len(tool_calls))
tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"])
tool_calls.append(tc)
end_last_tool_call = len(answer)
if stream:
len_seen = len(seen_content)
new_content = answer[len_seen:]
@ -297,18 +274,25 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
continue
seen_content = answer
chunk = chat_streaming_chunk(new_content)
seen_content = answer
yield chunk
# stop generation if tool_calls were generated previously
if len(tool_calls) > 0:
break
token_count = len(encode(prompt)[0])
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if len(tool_calls) > 0:
stop_reason = "tool_calls"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']:
stop_reason = "length"
if stream:
chunk = chat_streaming_chunk('')
chunk = chat_streaming_chunk('', tool_calls)
chunk[resp_list][0]['finish_reason'] = stop_reason
chunk['usage'] = {
"prompt_tokens": token_count,
@ -326,7 +310,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
resp_list: [{
"index": 0,
"finish_reason": stop_reason,
"message": {"role": "assistant", "content": answer}
"message": {"role": "assistant", "content": answer},
"tool_calls": tool_calls
}],
"usage": {
"prompt_tokens": token_count,
@ -515,3 +500,19 @@ def completions(body: dict, is_legacy: bool = False) -> dict:
def stream_completions(body: dict, is_legacy: bool = False):
for resp in completions_common(body, is_legacy, stream=True):
yield resp
def validateTools(tools: list[dict]):
    # Validate each tool definition in the JSON array
    valid_tools = None
    for idx in range(len(tools)):
        tool = tools[idx]
        try:
            tool_definition = ToolDefinition(**tool)
            if valid_tools is None:
                valid_tools = []
            valid_tools.append(tool)
        except ValidationError:
            raise InvalidRequestError(message=f"Invalid tool specification at index {idx}.", param='tools')

    return valid_tools

View file

@ -14,6 +14,7 @@ from fastapi.requests import Request
from fastapi.responses import JSONResponse
from pydub import AudioSegment
from sse_starlette import EventSourceResponse
from starlette.concurrency import iterate_in_threadpool
import extensions.openai.completions as OAIcompletions
import extensions.openai.images as OAIimages
@ -114,18 +115,28 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
if request_data.stream:
async def generator():
async with streaming_semaphore:
try:
response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy)
for resp in response:
async for resp in iterate_in_threadpool(response):
disconnected = await request.is_disconnected()
if disconnected:
break
yield {"data": json.dumps(resp)}
finally:
stop_everything_event()
response.close()
return
return EventSourceResponse(generator()) # SSE streaming
else:
response = OAIcompletions.completions(to_dict(request_data), is_legacy=is_legacy)
response = await asyncio.to_thread(
OAIcompletions.completions,
to_dict(request_data),
is_legacy=is_legacy
)
return JSONResponse(response)
@ -137,18 +148,28 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
if request_data.stream:
async def generator():
async with streaming_semaphore:
try:
response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy)
for resp in response:
async for resp in iterate_in_threadpool(response):
disconnected = await request.is_disconnected()
if disconnected:
break
yield {"data": json.dumps(resp)}
finally:
stop_everything_event()
response.close()
return
return EventSourceResponse(generator()) # SSE streaming
else:
response = OAIcompletions.chat_completions(to_dict(request_data), is_legacy=is_legacy)
response = await asyncio.to_thread(
OAIcompletions.chat_completions,
to_dict(request_data),
is_legacy=is_legacy
)
return JSONResponse(response)
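For context, `iterate_in_threadpool` from Starlette consumes a blocking synchronous generator from async code by running each `next()` call in a worker thread, which is what keeps the event loop free to notice client disconnects while tokens stream. A minimal, self-contained sketch (the generator here is an invented stand-in, not the project's):
```
import asyncio

from starlette.concurrency import iterate_in_threadpool

def slow_sync_generator():
    # stand-in for a blocking token generator such as stream_chat_completions()
    for i in range(3):
        yield {"chunk": i}

async def main():
    # each next() call runs in a worker thread; the event loop stays responsive
    async for item in iterate_in_threadpool(slow_sync_generator()):
        print(item)

asyncio.run(main())
```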
@ -436,7 +457,7 @@ def run_server():
# Start server
logging.getLogger("uvicorn.error").propagate = False
uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)
def setup():

View file

@ -1,8 +1,8 @@
import json
import time
from typing import Dict, List
from typing import Dict, List, Optional
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, validator
class GenerationOptions(BaseModel):
@ -54,6 +54,48 @@ class GenerationOptions(BaseModel):
grammar_string: str = ""
class ToolDefinition(BaseModel):
    function: 'ToolFunction'
    type: str

class ToolFunction(BaseModel):
    description: str
    name: str
    parameters: 'ToolParameters'

class ToolParameters(BaseModel):
    properties: Optional[Dict[str, 'ToolProperty']] = None
    required: Optional[list[str]] = None
    type: str
    description: Optional[str] = None

class ToolProperty(BaseModel):
    description: Optional[str] = None
    type: Optional[str] = None  # we are faced with definitions like anyOf, e.g. {'type': 'function', 'function': {'name': 'git_create_branch', 'description': 'Creates a new branch from an optional base branch', 'parameters': {'type': 'object', 'properties': {'repo_path': {'title': 'Repo Path', 'type': 'string'}, 'branch_name': {'title': 'Branch Name', 'type': 'string'}, 'base_branch': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'Base Branch'}}, 'required': ['repo_path', 'branch_name'], 'title': 'GitCreateBranch'}}}

class FunctionCall(BaseModel):
    name: str
    arguments: Optional[str] = None
    parameters: Optional[str] = None

    @validator('arguments', allow_reuse=True)
    def checkPropertyArgsOrParams(cls, v, values, **kwargs):
        if not v and not values.get('parameters'):
            raise ValueError("At least one of 'arguments' or 'parameters' must be provided as property in FunctionCall type")
        return v

class ToolCall(BaseModel):
    id: str
    index: int
    type: str
    function: FunctionCall
class CompletionRequestParams(BaseModel):
model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.")
prompt: str | List[str]
@ -92,6 +134,7 @@ class ChatCompletionRequestParams(BaseModel):
frequency_penalty: float | None = 0
function_call: str | dict | None = Field(default=None, description="Unused parameter.")
functions: List[dict] | None = Field(default=None, description="Unused parameter.")
tools: List[dict] | None = Field(default=None, description="Tools signatures passed via MCP.")
logit_bias: dict | None = None
max_tokens: int | None = None
n: int | None = Field(default=1, description="Unused parameter.")
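To illustrate what the new models accept, the following hypothetical sketch validates a tool specification against `ToolDefinition`; a malformed spec raises `pydantic.ValidationError`, which `validateTools` in `completions.py` translates into an `InvalidRequestError`:
```
# Hypothetical tool spec checked against the new pydantic models.
from pydantic import ValidationError

from extensions.openai.typing import ToolDefinition

spec = {
    "type": "function",
    "function": {
        "name": "get_current_time",
        "description": "Get the current time in a given timezone",
        "parameters": {
            "type": "object",
            "required": ["timezone"],
            "properties": {"timezone": {"type": "string", "description": "IANA timezone name"}},
        },
    },
}

try:
    ToolDefinition(**spec)
    print("tool spec is valid")
except ValidationError as exc:
    print("invalid tool spec:", exc)
```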

View file

@ -1,5 +1,8 @@
import base64
import json
import os
import random
import re
import time
import traceback
from typing import Callable, Optional
@ -52,3 +55,94 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star
time.sleep(3)
raise Exception('Could not start cloudflared.')
def getToolCallId() -> str:
    letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789"
    b = [random.choice(letter_bytes) for _ in range(8)]
    return "call_" + "".join(b).lower()

def checkAndSanitizeToolCallCandidate(candidate_dict: dict, tool_names: list[str]):
    # check if property 'function' exists and is a dictionary, otherwise adapt dict
    if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str):
        candidate_dict = {"type": "function", "function": candidate_dict}

    if 'function' in candidate_dict and isinstance(candidate_dict['function'], str):
        candidate_dict['name'] = candidate_dict['function']
        del candidate_dict['function']
        candidate_dict = {"type": "function", "function": candidate_dict}

    if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict):
        # check if 'name' exists within 'function' and is part of known tools
        if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names:
            candidate_dict["type"] = "function"  # ensure required property 'type' exists and has the right value
            # map property 'parameters' used by some older models to 'arguments'
            if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]:
                candidate_dict["function"]["arguments"] = candidate_dict["function"]["parameters"]
                del candidate_dict["function"]["parameters"]

            return candidate_dict

    return None

def parseToolCall(answer: str, tool_names: list[str]):
    matches = []

    # abort on very short answers to save computation cycles
    if len(answer) < 10:
        return matches

    # Define the regex pattern to find the JSON content wrapped in <function>, <tools>, <tool_call>, and other tags observed from various models
    patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)</\1>"]

    for pattern in patterns:
        for match in re.finditer(pattern, answer, re.DOTALL):
            # print(match.group(2))
            if match.group(2) is None:
                continue

            # remove backtick wraps if present
            candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip())
            candidate = re.sub(r"```$", "", candidate.strip())
            # unwrap inner tags
            candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL)

            # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually
            if re.search(r"\}\s*\n\s*\{", candidate) is not None:
                candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
            if not candidate.strip().startswith("["):
                candidate = "[" + candidate + "]"

            candidates = []
            try:
                # parse the candidate JSON into a dictionary
                candidates = json.loads(candidate)
                if not isinstance(candidates, list):
                    candidates = [candidates]
            except json.JSONDecodeError:
                # Ignore invalid JSON silently
                continue

            for candidate_dict in candidates:
                checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
                if checked_candidate is not None:
                    matches.append(checked_candidate)

    # last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags
    if len(matches) == 0:
        try:
            candidate = answer
            # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually
            if re.search(r"\}\s*\n\s*\{", candidate) is not None:
                candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
            if not candidate.strip().startswith("["):
                candidate = "[" + candidate + "]"
            # parse the candidate JSON into a dictionary
            candidates = json.loads(candidate)
            if not isinstance(candidates, list):
                candidates = [candidates]
            for candidate_dict in candidates:
                checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
                if checked_candidate is not None:
                    matches.append(checked_candidate)
        except json.JSONDecodeError:
            # Ignore invalid JSON silently
            pass

    return matches
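To make the parsing behaviour concrete, here is a small, hypothetical exercise of the helpers above; the model output string and tool name are invented, and note that `arguments` is still a dict at this point (it is only serialized with `json.dumps` later, in `completions.py`):
```
# Hypothetical, standalone exercise of the tool-call helpers added above.
from extensions.openai.utils import getToolCallId, parseToolCall

answer = (
    "<tool_call>\n"
    '{"name": "get_current_time", "arguments": {"timezone": "America/New_York"}}\n'
    "</tool_call>"
)

calls = parseToolCall(answer, ["get_current_time"])
for tc in calls:
    tc["id"] = getToolCallId()  # e.g. "call_x7k2m9ab"
    print(tc["type"], tc["function"]["name"], tc["function"]["arguments"])
```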

View file

@ -1,10 +1,11 @@
import math
import random
import threading
import torch
import chromadb
import numpy as np
import posthog
import torch
from chromadb.config import Settings
from chromadb.utils import embedding_functions
@ -292,6 +293,8 @@ class ChromaCollector():
for doc in documents:
doc_tokens = encode(doc)[0]
if isinstance(doc_tokens, np.ndarray):
doc_tokens = doc_tokens.tolist()
doc_token_count = len(doc_tokens)
if current_token_count + doc_token_count > max_token_count:
# If adding this document would exceed the max token count,

View file

@ -1,3 +1,7 @@
// -------------------------------------------------
// Event handlers
// -------------------------------------------------
function copyToClipboard(element) {
if (!element) return;
@ -18,6 +22,201 @@ function copyToClipboard(element) {
});
}
function branchHere(element) {
if (!element) return;
const messageElement = element.closest(".message, .user-message, .assistant-message");
if (!messageElement) return;
const index = messageElement.getAttribute("data-index");
if (!index) return;
const branchIndexInput = document.getElementById("Branch-index").querySelector("input");
if (!branchIndexInput) {
console.error("Element with ID 'Branch-index' not found.");
return;
}
const branchButton = document.getElementById("Branch");
if (!branchButton) {
console.error("Required element 'Branch' not found.");
return;
}
branchIndexInput.value = index;
// Trigger any 'change' or 'input' events Gradio might be listening for
const event = new Event("input", { bubbles: true });
branchIndexInput.dispatchEvent(event);
branchButton.click();
}
// -------------------------------------------------
// Message Editing Functions
// -------------------------------------------------
function editHere(buttonElement) {
if (!buttonElement) return;
const messageElement = buttonElement.closest(".message, .user-message, .assistant-message");
if (!messageElement) return;
const messageBody = messageElement.querySelector(".message-body");
if (!messageBody) return;
// If already editing, focus the textarea
const existingTextarea = messageBody.querySelector(".editing-textarea");
if (existingTextarea) {
existingTextarea.focus();
return;
}
// Determine role based on message element - handle different chat modes
const isUserMessage = messageElement.classList.contains("user-message") ||
messageElement.querySelector(".text-you") !== null ||
messageElement.querySelector(".circle-you") !== null;
startEditing(messageElement, messageBody, isUserMessage);
}
function startEditing(messageElement, messageBody, isUserMessage) {
const rawText = messageElement.getAttribute("data-raw") || messageBody.textContent;
const originalHTML = messageBody.innerHTML;
// Create editing interface
const editingInterface = createEditingInterface(rawText);
// Replace message content
messageBody.innerHTML = "";
messageBody.appendChild(editingInterface.textarea);
messageBody.appendChild(editingInterface.controls);
editingInterface.textarea.focus();
editingInterface.textarea.setSelectionRange(rawText.length, rawText.length);
// Setup event handlers
setupEditingHandlers(editingInterface.textarea, messageElement, originalHTML, messageBody, isUserMessage);
}
function createEditingInterface(text) {
const textarea = document.createElement("textarea");
textarea.value = text;
textarea.className = "editing-textarea";
textarea.rows = Math.max(3, text.split("\n").length);
const controls = document.createElement("div");
controls.className = "edit-controls-container";
const saveButton = document.createElement("button");
saveButton.textContent = "Save";
saveButton.className = "edit-control-button";
saveButton.type = "button";
const cancelButton = document.createElement("button");
cancelButton.textContent = "Cancel";
cancelButton.className = "edit-control-button edit-cancel-button";
cancelButton.type = "button";
controls.appendChild(saveButton);
controls.appendChild(cancelButton);
return { textarea, controls, saveButton, cancelButton };
}
function setupEditingHandlers(textarea, messageElement, originalHTML, messageBody, isUserMessage) {
const saveButton = messageBody.querySelector(".edit-control-button:not(.edit-cancel-button)");
const cancelButton = messageBody.querySelector(".edit-cancel-button");
const submitEdit = () => {
const index = messageElement.getAttribute("data-index");
if (!index || !submitMessageEdit(index, textarea.value, isUserMessage)) {
cancelEdit();
}
};
const cancelEdit = () => {
messageBody.innerHTML = originalHTML;
};
// Event handlers
saveButton.onclick = submitEdit;
cancelButton.onclick = cancelEdit;
textarea.onkeydown = (e) => {
if (e.key === "Enter" && !e.shiftKey) {
e.preventDefault();
submitEdit();
} else if (e.key === "Escape") {
e.preventDefault();
cancelEdit();
}
};
}
function submitMessageEdit(index, newText, isUserMessage) {
const editIndexInput = document.getElementById("Edit-message-index")?.querySelector("input");
const editTextInput = document.getElementById("Edit-message-text")?.querySelector("textarea");
const editRoleInput = document.getElementById("Edit-message-role")?.querySelector("textarea");
const editButton = document.getElementById("Edit-message");
if (!editIndexInput || !editTextInput || !editRoleInput || !editButton) {
console.error("Edit elements not found");
return false;
}
editIndexInput.value = index;
editTextInput.value = newText;
editRoleInput.value = isUserMessage ? "user" : "assistant";
editIndexInput.dispatchEvent(new Event("input", { bubbles: true }));
editTextInput.dispatchEvent(new Event("input", { bubbles: true }));
editRoleInput.dispatchEvent(new Event("input", { bubbles: true }));
editButton.click();
return true;
}
function navigateVersion(element, direction) {
if (!element) return;
const messageElement = element.closest(".message, .user-message, .assistant-message");
if (!messageElement) return;
const index = messageElement.getAttribute("data-index");
if (!index) return;
// Determine role based on message element classes
let role = "assistant"; // Default role
if (messageElement.classList.contains("user-message") ||
messageElement.querySelector(".text-you") ||
messageElement.querySelector(".circle-you")) {
role = "user";
}
const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input");
const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea");
const roleInput = document.getElementById("Navigate-message-role")?.querySelector("textarea");
const navigateButton = document.getElementById("Navigate-version");
if (!indexInput || !directionInput || !roleInput || !navigateButton) {
console.error("Navigation control elements (index, direction, role, or button) not found.");
return;
}
indexInput.value = index;
directionInput.value = direction;
roleInput.value = role;
// Trigger 'input' events for Gradio to pick up changes
const event = new Event("input", { bubbles: true });
indexInput.dispatchEvent(event);
directionInput.dispatchEvent(event);
roleInput.dispatchEvent(event);
navigateButton.click();
}
function regenerateClick() {
document.getElementById("Regenerate").click();
}

View file

@ -1,3 +1,7 @@
// ------------------------------------------------
// Main
// ------------------------------------------------
let main_parent = document.getElementById("chat-tab").parentNode;
let extensions = document.getElementById("extensions");
@ -39,9 +43,24 @@ document.querySelector(".header_bar").addEventListener("click", function(event)
//------------------------------------------------
// Keyboard shortcuts
//------------------------------------------------
// --- Helper functions --- //
function isModifiedKeyboardEvent() {
return (event instanceof KeyboardEvent &&
(event.shiftKey || event.ctrlKey || event.altKey || event.metaKey));
}
function isFocusedOnEditableTextbox() {
if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") {
return !!event.target.value;
}
return false;
}
let previousTabId = "chat-tab-button";
document.addEventListener("keydown", function(event) {
// Stop generation on Esc pressed
if (event.key === "Escape") {
// Find the element with id 'stop' and click it
@ -49,10 +68,15 @@ document.addEventListener("keydown", function(event) {
if (stopButton) {
stopButton.click();
}
return;
}
if (!document.querySelector("#chat-tab").checkVisibility() ) {
return;
}
// Show chat controls on Ctrl + S
else if (event.ctrlKey && event.key == "s") {
if (event.ctrlKey && event.key == "s") {
event.preventDefault();
var showControlsElement = document.getElementById("show-controls");
@ -82,24 +106,29 @@ document.addEventListener("keydown", function(event) {
document.getElementById("Remove-last").click();
}
// Copy last on Ctrl + Shift + K
else if (event.ctrlKey && event.shiftKey && event.key === "K") {
event.preventDefault();
document.getElementById("Copy-last").click();
}
// Replace last on Ctrl + Shift + L
else if (event.ctrlKey && event.shiftKey && event.key === "L") {
event.preventDefault();
document.getElementById("Replace-last").click();
}
// Impersonate on Ctrl + Shift + M
else if (event.ctrlKey && event.shiftKey && event.key === "M") {
event.preventDefault();
document.getElementById("Impersonate").click();
}
// --- Simple version navigation --- //
if (!isFocusedOnEditableTextbox()) {
// Version navigation on Arrow keys (horizontal)
if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") {
event.preventDefault();
navigateLastAssistantMessage("left");
}
else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") {
event.preventDefault();
if (!navigateLastAssistantMessage("right")) {
// If can't navigate right (last version), regenerate
document.getElementById("Regenerate").click();
}
}
}
});
//------------------------------------------------
@ -132,8 +161,6 @@ targetElement.addEventListener("scroll", function() {
// Create a MutationObserver instance
const observer = new MutationObserver(function(mutations) {
updateCssProperties();
if (targetElement.classList.contains("_generating")) {
typing.parentNode.classList.add("visible-dots");
document.getElementById("stop").style.display = "flex";
@ -144,12 +171,24 @@ const observer = new MutationObserver(function(mutations) {
document.getElementById("Generate").style.display = "flex";
}
doSyntaxHighlighting();
if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) {
targetElement.scrollTop = targetElement.scrollHeight;
}
const chatElement = document.getElementById("chat");
if (chatElement && chatElement.getAttribute("data-mode") === "instruct") {
const messagesContainer = chatElement.querySelector(".messages");
const lastChild = messagesContainer?.lastElementChild;
const prevSibling = lastChild?.previousElementSibling;
if (lastChild && prevSibling) {
lastChild.style.setProperty("margin-bottom",
`max(0px, calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px))`,
"important"
);
}
}
});
// Configure the observer to watch for changes in the subtree and attributes
@ -436,38 +475,6 @@ const chatInput = document.querySelector("#chat-input textarea");
// Variables to store current dimensions
let currentChatInputHeight = chatInput.clientHeight;
// Update chat layout based on chat and input dimensions
function updateCssProperties() {
const chatInputHeight = chatInput.clientHeight;
// Check if the chat container is visible
if (chatContainer.clientHeight > 0) {
const chatContainerParentHeight = chatContainer.parentNode.clientHeight;
const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`;
document.documentElement.style.setProperty("--chat-height", newChatHeight);
document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`);
// Adjust scrollTop based on input height change
if (chatInputHeight !== currentChatInputHeight) {
const deltaHeight = chatInputHeight - currentChatInputHeight;
if (!isScrolled && deltaHeight < 0) {
chatContainer.scrollTop = chatContainer.scrollHeight;
} else {
chatContainer.scrollTop += deltaHeight;
}
currentChatInputHeight = chatInputHeight;
}
}
}
// Observe textarea size changes and call update function
new ResizeObserver(updateCssProperties).observe(document.querySelector("#chat-input textarea"));
// Handle changes in window size
window.addEventListener("resize", updateCssProperties);
//------------------------------------------------
// Focus on the rename text area when it becomes visible
//------------------------------------------------
@ -813,3 +820,55 @@ function createMobileTopBar() {
}
createMobileTopBar();
//------------------------------------------------
// Simple Navigation Functions
//------------------------------------------------
function navigateLastAssistantMessage(direction) {
const chat = document.querySelector("#chat");
if (!chat) return false;
const messages = chat.querySelectorAll("[data-index]");
if (messages.length === 0) return false;
// Find the last assistant message (starting from the end)
let lastAssistantMessage = null;
for (let i = messages.length - 1; i >= 0; i--) {
const msg = messages[i];
if (
msg.classList.contains("assistant-message") ||
msg.querySelector(".circle-bot") ||
msg.querySelector(".text-bot")
) {
lastAssistantMessage = msg;
break;
}
}
if (!lastAssistantMessage) return false;
const buttons = lastAssistantMessage.querySelectorAll(".version-nav-button");
for (let i = 0; i < buttons.length; i++) {
const button = buttons[i];
const onclick = button.getAttribute("onclick");
const disabled = button.hasAttribute("disabled");
const isLeft = onclick && onclick.includes("'left'");
const isRight = onclick && onclick.includes("'right'");
if (!disabled) {
if (direction === "left" && isLeft) {
navigateVersion(button, direction);
return true;
}
if (direction === "right" && isRight) {
navigateVersion(button, direction);
return true;
}
}
}
return false;
}

View file

@ -5,6 +5,7 @@ import html
import json
import pprint
import re
import time
from datetime import datetime
from functools import partial
from pathlib import Path
@ -30,12 +31,37 @@ from modules.text_generation import (
get_max_prompt_length
)
from modules.utils import delete_file, get_available_characters, save_file
from modules.web_search import add_web_search_attachments
def strftime_now(format):
return datetime.now().strftime(format)
def get_current_timestamp():
"""Returns the current time in 24-hour format"""
return datetime.now().strftime('%b %d, %Y %H:%M')
def update_message_metadata(metadata_dict, role, index, **fields):
"""
Updates or adds metadata fields for a specific message.
Args:
metadata_dict: The metadata dictionary
role: The role (user, assistant, etc.)
index: The message index
**fields: Arbitrary metadata fields to update/add
"""
key = f"{role}_{index}"
if key not in metadata_dict:
metadata_dict[key] = {}
# Update with provided fields
for field_name, field_value in fields.items():
metadata_dict[key][field_name] = field_value
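# Illustrative sketch of the helper above: entries are keyed as "{role}_{index}",
# so repeated calls for the same message merge fields instead of replacing the
# whole entry (values below are examples only).
def _example_metadata_usage():
    meta = {}
    update_message_metadata(meta, "user", 0, timestamp=get_current_timestamp())
    update_message_metadata(meta, "user", 0, attachments=[])
    return meta  # {"user_0": {"timestamp": "<current time>", "attachments": []}}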
jinja_env = ImmutableSandboxedEnvironment(
trim_blocks=True,
lstrip_blocks=True,
@ -132,7 +158,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
impersonate = kwargs.get('impersonate', False)
_continue = kwargs.get('_continue', False)
also_return_rows = kwargs.get('also_return_rows', False)
history = kwargs.get('history', state['history'])['internal']
history_data = kwargs.get('history', state['history'])
history = history_data['internal']
metadata = history_data.get('metadata', {})
# Templates
chat_template_str = state['chat_template_str']
@ -145,7 +173,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
instruct_renderer = partial(
instruction_template.render,
builtin_tools=None,
tools=None,
tools=state['tools'] if 'tools' in state else None,
tools_in_user_message=False,
add_generation_prompt=False
)
@ -171,18 +199,62 @@ def generate_chat_prompt(user_input, state, **kwargs):
messages.append({"role": "system", "content": context})
insert_pos = len(messages)
for user_msg, assistant_msg in reversed(history):
user_msg = user_msg.strip()
assistant_msg = assistant_msg.strip()
for i, entry in enumerate(reversed(history)):
user_msg = entry[0].strip()
assistant_msg = entry[1].strip()
tool_msg = entry[2].strip() if len(entry) > 2 else ''
row_idx = len(history) - i - 1
if tool_msg:
messages.insert(insert_pos, {"role": "tool", "content": tool_msg})
if assistant_msg:
messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']:
messages.insert(insert_pos, {"role": "user", "content": user_msg})
# Check for user message attachments in metadata
user_key = f"user_{row_idx}"
enhanced_user_msg = user_msg
# Add attachment content if present
if user_key in metadata and "attachments" in metadata[user_key]:
attachments_text = ""
for attachment in metadata[user_key]["attachments"]:
filename = attachment.get("name", "file")
content = attachment.get("content", "")
attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
if attachments_text:
enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}"
messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg})
user_input = user_input.strip()
if user_input and not impersonate and not _continue:
# Check if we have attachments even with empty input
has_attachments = False
if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
current_row_idx = len(history)
user_key = f"user_{current_row_idx}"
has_attachments = user_key in metadata and "attachments" in metadata[user_key]
if (user_input or has_attachments) and not impersonate and not _continue:
# For the current user input being processed, check if we need to add attachments
if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
current_row_idx = len(history)
user_key = f"user_{current_row_idx}"
if user_key in metadata and "attachments" in metadata[user_key]:
attachments_text = ""
for attachment in metadata[user_key]["attachments"]:
filename = attachment.get("name", "file")
content = attachment.get("content", "")
attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
if attachments_text:
user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}"
messages.append({"role": "user", "content": user_input})
def make_prompt(messages):
@ -251,7 +323,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
# Resort to truncating the user input
else:
user_message = messages[-1]['content']
# Bisect the truncation point
@ -288,6 +359,50 @@ def generate_chat_prompt(user_input, state, **kwargs):
return prompt
def count_prompt_tokens(text_input, state):
"""Count tokens for current history + input including attachments"""
if shared.tokenizer is None:
return "Tokenizer not available"
try:
# Handle dict format with text and files
files = []
if isinstance(text_input, dict):
files = text_input.get('files', [])
text = text_input.get('text', '')
else:
text = text_input
files = []
# Create temporary history copy to add attachments
temp_history = copy.deepcopy(state['history'])
if 'metadata' not in temp_history:
temp_history['metadata'] = {}
# Process attachments if any
if files:
row_idx = len(temp_history['internal'])
for file_path in files:
add_message_attachment(temp_history, row_idx, file_path, is_user=True)
# Create temp state with modified history
temp_state = copy.deepcopy(state)
temp_state['history'] = temp_history
# Build prompt using existing logic
prompt = generate_chat_prompt(text, temp_state)
current_tokens = get_encoded_length(prompt)
max_tokens = temp_state['truncation_length']
percentage = (current_tokens / max_tokens) * 100 if max_tokens > 0 else 0
return f"History + Input:<br/>{current_tokens:,} / {max_tokens:,} tokens ({percentage:.1f}%)"
except Exception as e:
logger.error(f"Error counting tokens: {e}")
return f"Error: {str(e)}"
def get_stopping_strings(state):
stopping_strings = []
renderers = []
@ -336,6 +451,114 @@ def get_stopping_strings(state):
return result
def add_message_version(history, role, row_idx, is_current=True):
key = f"{role}_{row_idx}"
if 'metadata' not in history:
history['metadata'] = {}
if key not in history['metadata']:
history['metadata'][key] = {}
if "versions" not in history['metadata'][key]:
history['metadata'][key]["versions"] = []
# Determine which index to use for content based on role
content_idx = 0 if role == 'user' else 1
current_content = history['internal'][row_idx][content_idx]
current_visible = history['visible'][row_idx][content_idx]
history['metadata'][key]["versions"].append({
"content": current_content,
"visible_content": current_visible,
"timestamp": get_current_timestamp()
})
if is_current:
# Set the current_version_index to the newly added version (which is now the last one).
history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1
def add_message_attachment(history, row_idx, file_path, is_user=True):
"""Add a file attachment to a message in history metadata"""
if 'metadata' not in history:
history['metadata'] = {}
key = f"{'user' if is_user else 'assistant'}_{row_idx}"
if key not in history['metadata']:
history['metadata'][key] = {"timestamp": get_current_timestamp()}
if "attachments" not in history['metadata'][key]:
history['metadata'][key]["attachments"] = []
# Get file info using pathlib
path = Path(file_path)
filename = path.name
file_extension = path.suffix.lower()
try:
# Handle different file types
if file_extension == '.pdf':
# Process PDF file
content = extract_pdf_text(path)
file_type = "application/pdf"
else:
# Default handling for text files
with open(path, 'r', encoding='utf-8') as f:
content = f.read()
file_type = "text/plain"
# Add attachment
attachment = {
"name": filename,
"type": file_type,
"content": content,
}
history['metadata'][key]["attachments"].append(attachment)
return content # Return the content for reuse
except Exception as e:
logger.error(f"Error processing attachment {filename}: {e}")
return None
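# Minimal sketch of the attachment flow, assuming a plain-text file path: the
# helper stores the file contents under metadata["user_<row>"]["attachments"],
# and generate_chat_prompt() later inlines them into an "ATTACHMENTS:" block.
def _example_attach_text_file(history, file_path="notes.txt"):
    row_idx = len(history['internal'])  # row of the message being composed
    return add_message_attachment(history, row_idx, file_path, is_user=True)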
def extract_pdf_text(pdf_path):
"""Extract text from a PDF file"""
import PyPDF2
text = ""
try:
with open(pdf_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
text += page.extract_text() + "\n\n"
return text.strip()
except Exception as e:
logger.error(f"Error extracting text from PDF: {e}")
return f"[Error extracting PDF text: {str(e)}]"
def generate_search_query(user_message, state):
"""Generate a search query from user message using the LLM"""
# Augment the user message with search instruction
augmented_message = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else."
# Use a minimal state for search query generation but keep the full history
search_state = state.copy()
search_state['max_new_tokens'] = 64
search_state['auto_max_new_tokens'] = False
search_state['enable_thinking'] = False
# Generate the full prompt using existing history + augmented message
formatted_prompt = generate_chat_prompt(augmented_message, search_state)
query = ""
for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True):
query = reply.strip()
return query
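# Sketch of the intended behavior (example strings, not actual model output):
# for a user message like "what's the weather in Lisbon tomorrow?", the LLM is
# prompted to reply with only a short query such as "Lisbon weather tomorrow",
# which is then handed to the web search backend.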
def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False):
# Handle dict format with text and files
files = []
@ -509,16 +732,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
yield output
def impersonate_wrapper(text, state):
def impersonate_wrapper(textbox, state):
text = textbox['text']
static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
prompt = generate_chat_prompt('', state, impersonate=True)
stopping_strings = get_stopping_strings(state)
yield text + '...', static_output
textbox['text'] = text + '...'
yield textbox, static_output
reply = None
for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True):
yield (text + reply).lstrip(' '), static_output
textbox['text'] = (text + reply).lstrip(' ')
yield textbox, static_output
if shared.stop_everything:
return
@ -564,56 +790,81 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
send_dummy_reply(state['start_with'], state)
history = state['history']
last_save_time = time.monotonic()
save_interval = 8
for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)):
yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
current_time = time.monotonic()
# Save on first iteration or if save_interval seconds have passed
if i == 0 or (current_time - last_save_time) >= save_interval:
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
last_save_time = current_time
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
def remove_last_message(history):
if 'metadata' not in history:
history['metadata'] = {}
if len(history['visible']) > 0 and history['internal'][-1][0] != '<|BEGIN-VISIBLE-CHAT|>':
row_idx = len(history['internal']) - 1
last = history['visible'].pop()
history['internal'].pop()
# Remove metadata directly by known keys
if f"user_{row_idx}" in history['metadata']:
del history['metadata'][f"user_{row_idx}"]
if f"assistant_{row_idx}" in history['metadata']:
del history['metadata'][f"assistant_{row_idx}"]
else:
last = ['', '']
return html.unescape(last[0]), history
def send_last_reply_to_input(history):
if len(history['visible']) > 0:
return html.unescape(history['visible'][-1][1])
else:
return ''
def replace_last_reply(text, state):
history = state['history']
if len(text.strip()) == 0:
return history
elif len(history['visible']) > 0:
history['visible'][-1][1] = html.escape(text)
history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True)
return history
def send_dummy_message(text, state):
history = state['history']
# Handle both dict and string inputs
if isinstance(text, dict):
text = text['text']
# Initialize metadata if not present
if 'metadata' not in history:
history['metadata'] = {}
row_idx = len(history['internal'])
history['visible'].append([html.escape(text), ''])
history['internal'].append([apply_extensions('input', text, state, is_chat=True), ''])
update_message_metadata(history['metadata'], "user", row_idx, timestamp=get_current_timestamp())
return history
def send_dummy_reply(text, state):
history = state['history']
# Handle both dict and string inputs
if isinstance(text, dict):
text = text['text']
# Initialize metadata if not present
if 'metadata' not in history:
history['metadata'] = {}
if len(history['visible']) > 0 and not history['visible'][-1][1] == '':
row_idx = len(history['internal'])
history['visible'].append(['', ''])
history['internal'].append(['', ''])
# We don't need to add system metadata
row_idx = len(history['internal']) - 1
history['visible'][-1][1] = html.escape(text)
history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True)
update_message_metadata(history['metadata'], "assistant", row_idx, timestamp=get_current_timestamp())
return history
@ -623,7 +874,8 @@ def redraw_html(history, name1, name2, mode, style, character, reset_cache=False
def start_new_chat(state):
mode = state['mode']
history = {'internal': [], 'visible': []}
# Initialize with empty metadata dictionary
history = {'internal': [], 'visible': [], 'metadata': {}}
if mode != 'instruct':
greeting = replace_character_names(state['greeting'], state['name1'], state['name2'])
@ -631,6 +883,9 @@ def start_new_chat(state):
history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]]
history['visible'] += [['', apply_extensions('output', html.escape(greeting), state, is_chat=True)]]
# Add timestamp for assistant's greeting
update_message_metadata(history['metadata'], "assistant", 0, timestamp=get_current_timestamp())
unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
save_history(history, unique_id, state['character_menu'], state['mode'])
@ -811,6 +1066,16 @@ def load_history(unique_id, character, mode):
'visible': f['data_visible']
}
# Add metadata if it doesn't exist
if 'metadata' not in history:
history['metadata'] = {}
# Add placeholder timestamps for existing messages
for i, (user_msg, asst_msg) in enumerate(history['internal']):
if user_msg and user_msg != '<|BEGIN-VISIBLE-CHAT|>':
update_message_metadata(history['metadata'], "user", i, timestamp="")
if asst_msg:
update_message_metadata(history['metadata'], "assistant", i, timestamp="")
return history
@ -826,6 +1091,16 @@ def load_history_json(file, history):
'visible': f['data_visible']
}
# Add metadata if it doesn't exist
if 'metadata' not in history:
history['metadata'] = {}
# Add placeholder timestamps
for i, (user_msg, asst_msg) in enumerate(history['internal']):
if user_msg and user_msg != '<|BEGIN-VISIBLE-CHAT|>':
update_message_metadata(history['metadata'], "user", i, timestamp="")
if asst_msg:
update_message_metadata(history['metadata'], "assistant", i, timestamp="")
return history
except:
return history
@ -1147,20 +1422,12 @@ def my_yaml_output(data):
return result
def handle_replace_last_reply_click(text, state):
history = replace_last_reply(text, state)
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html, ""]
def handle_send_dummy_message_click(text, state):
history = send_dummy_message(text, state)
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html, ""]
return [history, html, {"text": "", "files": []}]
def handle_send_dummy_reply_click(text, state):
@ -1168,7 +1435,7 @@ def handle_send_dummy_reply_click(text, state):
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html, ""]
return [history, html, {"text": "", "files": []}]
def handle_remove_last_click(state):
@ -1176,7 +1443,7 @@ def handle_remove_last_click(state):
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html, last_input]
return [history, html, {"text": last_input, "files": []}]
def handle_unique_id_select(state):
@ -1222,7 +1489,13 @@ def handle_delete_chat_confirm_click(state):
def handle_branch_chat_click(state):
branch_from_index = state['branch_index']
if branch_from_index == -1:
history = state['history']
else:
history = state['history']
history['visible'] = history['visible'][:branch_from_index + 1]
history['internal'] = history['internal'][:branch_from_index + 1]
new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
save_history(history, new_unique_id, state['character_menu'], state['mode'])
@ -1233,7 +1506,93 @@ def handle_branch_chat_click(state):
past_chats_update = gr.update(choices=histories, value=new_unique_id)
return [history, html, past_chats_update]
return [history, html, past_chats_update, -1]
def handle_edit_message_click(state):
history = state['history']
message_index = int(state['edit_message_index'])
new_text = state['edit_message_text']
role = state['edit_message_role'] # "user" or "assistant"
if message_index >= len(history['internal']):
html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html_output]
role_idx = 0 if role == "user" else 1
if 'metadata' not in history:
history['metadata'] = {}
key = f"{role}_{message_index}"
if key not in history['metadata']:
history['metadata'][key] = {}
# If no versions exist yet for this message, store the current (pre-edit) content as the first version.
if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]:
original_content = history['internal'][message_index][role_idx]
original_visible = history['visible'][message_index][role_idx]
original_timestamp = history['metadata'][key].get('timestamp', get_current_timestamp())
history['metadata'][key]["versions"] = [{
"content": original_content,
"visible_content": original_visible,
"timestamp": original_timestamp
}]
history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True)
history['visible'][message_index][role_idx] = html.escape(new_text)
add_message_version(history, role, message_index, is_current=True)
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html_output]
def handle_navigate_version_click(state):
history = state['history']
message_index = int(state['navigate_message_index'])
direction = state['navigate_direction']
role = state['navigate_message_role']
if not role:
logger.error("Role not provided for version navigation.")
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html]
key = f"{role}_{message_index}"
if 'metadata' not in history or key not in history['metadata'] or 'versions' not in history['metadata'][key]:
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html]
metadata = history['metadata'][key]
versions = metadata['versions']
# Default to the last version if current_version_index is not set
current_idx = metadata.get('current_version_index', len(versions) - 1 if versions else 0)
if direction == 'left':
new_idx = max(0, current_idx - 1)
else: # right
new_idx = min(len(versions) - 1, current_idx + 1)
if new_idx == current_idx:
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html]
msg_content_idx = 0 if role == 'user' else 1 # 0 for user content, 1 for assistant content in the pair
version_to_load = versions[new_idx]
history['internal'][message_index][msg_content_idx] = version_to_load['content']
history['visible'][message_index][msg_content_idx] = version_to_load['visible_content']
metadata['current_version_index'] = new_idx
update_message_metadata(history['metadata'], role, message_index, timestamp=version_to_load['timestamp'])
# Redraw and save
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
return [history, html]
def handle_rename_chat_click():
@ -1375,7 +1734,7 @@ def handle_your_picture_change(picture, state):
def handle_send_instruction_click(state):
state['mode'] = 'instruct'
state['history'] = {'internal': [], 'visible': []}
state['history'] = {'internal': [], 'visible': [], 'metadata': {}}
output = generate_chat_prompt("Input", state)

View file

@ -119,7 +119,7 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
reset = True
# Maximum number of tokens to process in a single forward pass
max_chunk_size = 2048
max_chunk_size = 256
# Make the forward call
if labels is None:
@ -245,3 +245,20 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
return Exllamav3HF(pretrained_model_name_or_path)
def unload(self):
"""Properly unload the ExllamaV3 model and free GPU memory."""
if hasattr(self, 'ex_model') and self.ex_model is not None:
self.ex_model.unload()
self.ex_model = None
if hasattr(self, 'ex_cache') and self.ex_cache is not None:
self.ex_cache = None
# Clean up any additional ExllamaV3 resources
if hasattr(self, 'past_seq'):
self.past_seq = None
if hasattr(self, 'past_seq_negative'):
self.past_seq_negative = None
if hasattr(self, 'ex_cache_negative'):
self.ex_cache_negative = None

View file

@ -169,11 +169,7 @@ def convert_to_markdown(string, message_id=None):
thinking_block = f'''
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
<summary class="thinking-header">
<svg class="thinking-icon" width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M8 1.33334C4.31868 1.33334 1.33334 4.31868 1.33334 8.00001C1.33334 11.6813 4.31868 14.6667 8 14.6667C11.6813 14.6667 14.6667 11.6813 14.6667 8.00001C14.6667 4.31868 11.6813 1.33334 8 1.33334Z" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M8 10.6667V8.00001" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M8 5.33334H8.00667" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>
</svg>
{info_svg_small}
<span class="thinking-title">{title_text}</span>
</summary>
<div class="thinking-content pretty_scrollbar">{thinking_html}</div>
@ -339,41 +335,164 @@ copy_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" vie
refresh_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="tabler-icon tabler-icon-repeat"><path d="M4 12v-3a3 3 0 0 1 3 -3h13m-3 -3l3 3l-3 3"></path><path d="M20 12v3a3 3 0 0 1 -3 3h-13m3 3l-3 -3l3 -3"></path></svg>'''
continue_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon icon-tabler icons-tabler-outline icon-tabler-player-play"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M7 4v16l13 -8z" /></svg>'''
remove_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon icon-tabler icons-tabler-outline icon-tabler-trash"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M4 7l16 0" /><path d="M10 11l0 6" /><path d="M14 11l0 6" /><path d="M5 7l1 12a2 2 0 0 0 2 2h8a2 2 0 0 0 2 -2l1 -12" /><path d="M9 7v-3a1 1 0 0 1 1 -1h4a1 1 0 0 1 1 1v3" /></svg>'''
branch_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon icon-tabler icons-tabler-outline icon-tabler-git-branch"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M7 18m-2 0a2 2 0 1 0 4 0a2 2 0 1 0 -4 0" /><path d="M7 6m-2 0a2 2 0 1 0 4 0a2 2 0 1 0 -4 0" /><path d="M17 6m-2 0a2 2 0 1 0 4 0a2 2 0 1 0 -4 0" /><path d="M7 8l0 8" /><path d="M9 18h6a2 2 0 0 0 2 -2v-5" /><path d="M14 14l3 -3l3 3" /></svg>'''
edit_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="tabler-icon tabler-icon-pencil"><path d="M4 20h4l10.5 -10.5a2.828 2.828 0 1 0 -4 -4l-10.5 10.5v4"></path><path d="M13.5 6.5l4 4"></path></svg>'''
info_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="thinking-icon tabler-icon tabler-icon-info-circle"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M12 2a10 10 0 0 1 0 20a10 10 0 0 1 0 -20z" /><path d="M12 16v-4" /><path d="M12 8h.01" /></svg>'''
info_svg_small = '''<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="thinking-icon tabler-icon tabler-icon-info-circle"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M12 2a10 10 0 0 1 0 20a10 10 0 0 1 0 -20z" /><path d="M12 16v-4" /><path d="M12 8h.01" /></svg>'''
attachment_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21.44 11.05l-9.19 9.19a6 6 0 0 1-8.48-8.48l9.19-9.19a4 4 0 0 1 5.66 5.66l-9.2 9.19a2 2 0 0 1-2.83-2.83l8.49-8.48"></path></svg>'''
copy_button = f'<button class="footer-button footer-copy-button" title="Copy" onclick="copyToClipboard(this)">{copy_svg}</button>'
branch_button = f'<button class="footer-button footer-branch-button" title="Branch here" onclick="branchHere(this)">{branch_svg}</button>'
edit_button = f'<button class="footer-button footer-edit-button" title="Edit" onclick="editHere(this)">{edit_svg}</button>'
refresh_button = f'<button class="footer-button footer-refresh-button" title="Regenerate" onclick="regenerateClick()">{refresh_svg}</button>'
continue_button = f'<button class="footer-button footer-continue-button" title="Continue" onclick="continueClick()">{continue_svg}</button>'
remove_button = f'<button class="footer-button footer-remove-button" title="Remove last reply" onclick="removeLastClick()">{remove_svg}</button>'
info_button = f'<button class="footer-button footer-info-button" title="message">{info_svg}</button>'
def format_message_timestamp(history, role, index):
"""Get a formatted timestamp HTML span for a message if available"""
key = f"{role}_{index}"
if 'metadata' in history and key in history['metadata'] and history['metadata'][key].get('timestamp'):
timestamp = history['metadata'][key]['timestamp']
return f"<span class='timestamp'>{timestamp}</span>"
return ""
def format_message_attachments(history, role, index):
"""Get formatted HTML for message attachments if available"""
key = f"{role}_{index}"
if 'metadata' in history and key in history['metadata'] and 'attachments' in history['metadata'][key]:
attachments = history['metadata'][key]['attachments']
if not attachments:
return ""
attachments_html = '<div class="message-attachments">'
for attachment in attachments:
name = html.escape(attachment["name"])
# Make clickable if URL exists
if "url" in attachment:
name = f'<a href="{html.escape(attachment["url"])}" target="_blank" rel="noopener noreferrer">{name}</a>'
attachments_html += (
f'<div class="attachment-box">'
f'<div class="attachment-icon">{attachment_svg}</div>'
f'<div class="attachment-name">{name}</div>'
f'</div>'
)
attachments_html += '</div>'
return attachments_html
return ""
def get_version_navigation_html(history, i, role):
"""Generate simple navigation arrows for message versions"""
key = f"{role}_{i}"
metadata = history.get('metadata', {})
if key not in metadata or 'versions' not in metadata[key]:
return ""
versions = metadata[key]['versions']
# Default to the last version if current_version_index isn't set in metadata
current_idx = metadata[key].get('current_version_index', len(versions) - 1 if versions else 0)
if len(versions) <= 1:
return ""
left_disabled = ' disabled' if current_idx == 0 else ''
right_disabled = ' disabled' if current_idx >= len(versions) - 1 else ''
left_arrow = f'<button class="footer-button version-nav-button"{left_disabled} onclick="navigateVersion(this, \'left\')" title="Previous version">&lt;</button>'
right_arrow = f'<button class="footer-button version-nav-button"{right_disabled} onclick="navigateVersion(this, \'right\')" title="Next version">&gt;</button>'
position = f'<span class="version-position">{current_idx + 1}/{len(versions)}</span>'
return f'<div class="version-navigation">{left_arrow}{position}{right_arrow}</div>'
def actions_html(history, i, role, info_message=""):
action_buttons = ""
version_nav_html = ""
if role == "assistant":
action_buttons = (
f'{copy_button}'
f'{edit_button}'
f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
f'{continue_button if i == len(history["visible"]) - 1 else ""}'
f'{remove_button if i == len(history["visible"]) - 1 else ""}'
f'{branch_button}'
)
version_nav_html = get_version_navigation_html(history, i, "assistant")
elif role == "user":
action_buttons = (
f'{copy_button}'
f'{edit_button}'
)
version_nav_html = get_version_navigation_html(history, i, "user")
return (f'<div class="message-actions">'
f'{action_buttons}'
f'{info_message}'
f'</div>'
f'{version_nav_html}')
def generate_instruct_html(history):
output = f'<style>{instruct_css}</style><div class="chat" id="chat"><div class="messages">'
output = f'<style>{instruct_css}</style><div class="chat" id="chat" data-mode="instruct"><div class="messages">'
for i in range(len(history['visible'])):
row_visible = history['visible'][i]
row_internal = history['internal'][i]
converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
# Get timestamps
user_timestamp = format_message_timestamp(history, "user", i)
assistant_timestamp = format_message_timestamp(history, "assistant", i)
# Get attachments
user_attachments = format_message_attachments(history, "user", i)
assistant_attachments = format_message_attachments(history, "assistant", i)
# Create info buttons for timestamps if they exist
info_message_user = ""
if user_timestamp != "":
# Extract the timestamp value from the span
user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0]
info_message_user = info_button.replace("message", user_timestamp_value)
info_message_assistant = ""
if assistant_timestamp != "":
# Extract the timestamp value from the span
assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0]
info_message_assistant = info_button.replace("message", assistant_timestamp_value)
if converted_visible[0]: # Don't display empty user messages
output += (
f'<div class="user-message" '
f'data-raw="{html.escape(row_internal[0], quote=True)}">'
f'data-raw="{html.escape(row_internal[0], quote=True)}"'
f'data-index={i}>'
f'<div class="text">'
f'<div class="message-body">{converted_visible[0]}</div>'
f'{copy_button}'
f'{user_attachments}'
f'{actions_html(history, i, "user", info_message_user)}'
f'</div>'
f'</div>'
)
output += (
f'<div class="assistant-message" '
f'data-raw="{html.escape(row_internal[1], quote=True)}">'
f'data-raw="{html.escape(row_internal[1], quote=True)}"'
f'data-index={i}>'
f'<div class="text">'
f'<div class="message-body">{converted_visible[1]}</div>'
f'{copy_button}'
f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
f'{continue_button if i == len(history["visible"]) - 1 else ""}'
f'{remove_button if i == len(history["visible"]) - 1 else ""}'
f'{assistant_attachments}'
f'{actions_html(history, i, "assistant", info_message_assistant)}'
f'</div>'
f'</div>'
)
@ -401,30 +520,39 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache=
row_internal = history['internal'][i]
converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
# Get timestamps
user_timestamp = format_message_timestamp(history, "user", i)
assistant_timestamp = format_message_timestamp(history, "assistant", i)
# Get attachments
user_attachments = format_message_attachments(history, "user", i)
assistant_attachments = format_message_attachments(history, "assistant", i)
if converted_visible[0]: # Don't display empty user messages
output += (
f'<div class="message" '
f'data-raw="{html.escape(row_internal[0], quote=True)}">'
f'data-raw="{html.escape(row_internal[0], quote=True)}"'
f'data-index={i}>'
f'<div class="circle-you">{img_me}</div>'
f'<div class="text">'
f'<div class="username">{name1}</div>'
f'<div class="username">{name1}{user_timestamp}</div>'
f'<div class="message-body">{converted_visible[0]}</div>'
f'{copy_button}'
f'{user_attachments}'
f'{actions_html(history, i, "user")}'
f'</div>'
f'</div>'
)
output += (
f'<div class="message" '
f'data-raw="{html.escape(row_internal[1], quote=True)}">'
f'data-raw="{html.escape(row_internal[1], quote=True)}"'
f'data-index={i}>'
f'<div class="circle-bot">{img_bot}</div>'
f'<div class="text">'
f'<div class="username">{name2}</div>'
f'<div class="username">{name2}{assistant_timestamp}</div>'
f'<div class="message-body">{converted_visible[1]}</div>'
f'{copy_button}'
f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
f'{continue_button if i == len(history["visible"]) - 1 else ""}'
f'{remove_button if i == len(history["visible"]) - 1 else ""}'
f'{assistant_attachments}'
f'{actions_html(history, i, "assistant")}'
f'</div>'
f'</div>'
)
@ -441,26 +569,48 @@ def generate_chat_html(history, name1, name2, reset_cache=False):
row_internal = history['internal'][i]
converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
# Get timestamps
user_timestamp = format_message_timestamp(history, "user", i)
assistant_timestamp = format_message_timestamp(history, "assistant", i)
# Get attachments
user_attachments = format_message_attachments(history, "user", i)
assistant_attachments = format_message_attachments(history, "assistant", i)
# Create info buttons for timestamps if they exist
info_message_user = ""
if user_timestamp != "":
# Extract the timestamp value from the span
user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0]
info_message_user = info_button.replace("message", user_timestamp_value)
info_message_assistant = ""
if assistant_timestamp != "":
# Extract the timestamp value from the span
assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0]
info_message_assistant = info_button.replace("message", assistant_timestamp_value)
if converted_visible[0]: # Don't display empty user messages
output += (
f'<div class="message" '
f'data-raw="{html.escape(row_internal[0], quote=True)}">'
f'data-raw="{html.escape(row_internal[0], quote=True)}"'
f'data-index={i}>'
f'<div class="text-you">'
f'<div class="message-body">{converted_visible[0]}</div>'
f'{copy_button}'
f'{user_attachments}'
f'{actions_html(history, i, "user", info_message_user)}'
f'</div>'
f'</div>'
)
output += (
f'<div class="message" '
f'data-raw="{html.escape(row_internal[1], quote=True)}">'
f'data-raw="{html.escape(row_internal[1], quote=True)}"'
f'data-index={i}>'
f'<div class="text-bot">'
f'<div class="message-body">{converted_visible[1]}</div>'
f'{copy_button}'
f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
f'{continue_button if i == len(history["visible"]) - 1 else ""}'
f'{remove_button if i == len(history["visible"]) - 1 else ""}'
f'{assistant_attachments}'
f'{actions_html(history, i, "assistant", info_message_assistant)}'
f'</div>'
f'</div>'
)

View file

@ -66,7 +66,7 @@ class LlamaServer:
"top_k": state["top_k"],
"top_p": state["top_p"],
"min_p": state["min_p"],
"tfs_z": state["tfs"],
"top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1,
"typical_p": state["typical_p"],
"repeat_penalty": state["repetition_penalty"],
"repeat_last_n": state["repetition_penalty_range"],
@ -102,8 +102,10 @@ class LlamaServer:
penalty_found = False
for s in samplers:
if s.strip() in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]:
if s.strip() in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc"]:
filtered_samplers.append(s.strip())
elif s.strip() == "typical_p":
filtered_samplers.append("typ_p")
elif not penalty_found and s.strip() == "repetition_penalty":
filtered_samplers.append("penalties")
penalty_found = True
@ -144,8 +146,9 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print()
# Make a direct request with streaming enabled using a context manager
with self.session.post(url, json=payload, stream=True) as response:
# Make the generation request
response = self.session.post(url, json=payload, stream=True)
try:
response.raise_for_status() # Raise an exception for HTTP errors
full_text = ""
@ -182,6 +185,8 @@ class LlamaServer:
print(f"JSON decode error: {e}")
print(f"Problematic line: {line}")
continue
finally:
response.close()
def generate(self, prompt, state):
output = ""
@ -210,6 +215,7 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print()
for retry in range(5):
response = self.session.post(url, json=payload)
result = response.json()
@ -255,9 +261,10 @@ class LlamaServer:
self.server_path,
"--model", self.model_path,
"--ctx-size", str(shared.args.ctx_size),
"--n-gpu-layers", str(shared.args.n_gpu_layers),
"--gpu-layers", str(shared.args.gpu_layers),
"--batch-size", str(shared.args.batch_size),
"--port", str(self.port),
"--no-webui",
]
if shared.args.flash_attn:
@ -278,8 +285,10 @@ class LlamaServer:
cmd.append("--no-kv-offload")
if shared.args.row_split:
cmd += ["--split-mode", "row"]
cache_type = "fp16"
if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types:
cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
cache_type = shared.args.cache_type
if shared.args.compress_pos_emb != 1:
cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
if shared.args.rope_freq_base > 0:
@ -316,7 +325,13 @@ class LlamaServer:
for flag_item in extra_flags.split(','):
if '=' in flag_item:
flag, value = flag_item.split('=', 1)
if len(flag) <= 3:
cmd += [f"-{flag}", value]
else:
cmd += [f"--{flag}", value]
else:
if len(flag_item) <= 3:
cmd.append(f"-{flag_item}")
else:
cmd.append(f"--{flag_item}")
@ -333,6 +348,7 @@ class LlamaServer:
print(' '.join(str(item) for item in cmd[1:]))
print()
logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}")
# Start the server with pipes for output
self.process = subprocess.Popen(
cmd,

View file

@ -5,7 +5,7 @@ import gradio as gr
loaders_and_params = OrderedDict({
'llama.cpp': [
'n_gpu_layers',
'gpu_layers',
'threads',
'threads_batch',
'batch_size',
@ -28,6 +28,7 @@ loaders_and_params = OrderedDict({
'device_draft',
'ctx_size_draft',
'speculative_decoding_accordion',
'vram_info',
],
'Transformers': [
'gpu_split',
@ -84,17 +85,11 @@ loaders_and_params = OrderedDict({
'no_flash_attn',
'no_xformers',
'no_sdpa',
'exllamav2_info',
'model_draft',
'draft_max',
'ctx_size_draft',
'speculative_decoding_accordion',
],
'HQQ': [
'hqq_backend',
'trust_remote_code',
'no_use_fast',
],
'TensorRT-LLM': [
'ctx_size',
'cpp_runner',
@ -158,7 +153,6 @@ def transformers_samplers():
loaders_samplers = {
'Transformers': transformers_samplers(),
'HQQ': transformers_samplers(),
'ExLlamav3_HF': {
'temperature',
'dynatemp_low',
@ -299,7 +293,7 @@ loaders_samplers = {
'typical_p',
'xtc_threshold',
'xtc_probability',
'tfs',
'top_n_sigma',
'dry_multiplier',
'dry_allowed_length',
'dry_base',

View file

@ -7,6 +7,7 @@ from modules import models, shared
from modules.logging_colors import logger
from modules.models import load_model
from modules.text_generation import generate_reply
from modules.utils import check_model_loaded
global_scores = None
@ -33,9 +34,9 @@ def get_next_logits(*args, **kwargs):
def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
if shared.model is None:
logger.error("No model is loaded! Select one in the Model tab.")
return 'Error: No model is loaded1 Select one in the Model tab.', previous
model_is_loaded, error_message = check_model_loaded()
if not model_is_loaded:
return error_message, previous
# llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer':

View file

@ -21,7 +21,6 @@ def load_model(model_name, loader=None):
'ExLlamav3_HF': ExLlamav3_HF_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'HQQ': HQQ_loader,
'TensorRT-LLM': TensorRT_LLM_loader,
}
@ -71,7 +70,6 @@ def llama_cpp_server_loader(model_name):
else:
model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
logger.info(f"llama.cpp weights detected: \"{model_file}\"")
try:
model = LlamaServer(model_file)
return model, model
@ -103,21 +101,6 @@ def ExLlamav2_loader(model_name):
return model, tokenizer
def HQQ_loader(model_name):
try:
from hqq.core.quantize import HQQBackend, HQQLinear
from hqq.models.hf.base import AutoHQQHFModel
except ModuleNotFoundError:
raise ModuleNotFoundError("Failed to import 'hqq'. Please install it manually following the instructions in the HQQ GitHub repository.")
logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"")
model_dir = Path(f'{shared.args.model_dir}/{model_name}')
model = AutoHQQHFModel.from_quantized(str(model_dir))
HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend))
return model
def TensorRT_LLM_loader(model_name):
try:
from modules.tensorrt_llm import TensorRTLLMModel
@ -133,10 +116,13 @@ def unload_model(keep_model_name=False):
return
is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
if shared.args.loader == 'ExLlamav3_HF':
shared.model.unload()
shared.model = shared.tokenizer = None
shared.lora_names = []
shared.model_dirty_from_training = False
if not is_llamacpp:
from modules.torch_utils import clear_torch_cache
clear_torch_cache()

View file

@ -1,7 +1,11 @@
import functools
import json
import re
import subprocess
from math import floor
from pathlib import Path
import gradio as gr
import yaml
from modules import chat, loaders, metadata_gguf, shared, ui
@ -54,7 +58,7 @@ def get_model_metadata(model):
else:
model_file = list(path.glob('*.gguf'))[0]
metadata = metadata_gguf.load_metadata(model_file)
metadata = load_gguf_metadata_with_cache(model_file)
for k in metadata:
if k.endswith('context_length'):
@ -67,7 +71,8 @@ def get_model_metadata(model):
elif k.endswith('rope.scaling.factor'):
model_settings['compress_pos_emb'] = metadata[k]
elif k.endswith('block_count'):
model_settings['n_gpu_layers'] = metadata[k] + 1
model_settings['gpu_layers'] = metadata[k] + 1
model_settings['max_gpu_layers'] = metadata[k] + 1
if 'tokenizer.chat_template' in metadata:
template = metadata['tokenizer.chat_template']
@ -149,7 +154,11 @@ def get_model_metadata(model):
for pat in settings:
if re.match(pat.lower(), Path(model).name.lower()):
for k in settings[pat]:
model_settings[k] = settings[pat][k]
new_k = k
if k == 'n_gpu_layers':
new_k = 'gpu_layers'
model_settings[new_k] = settings[pat][k]
# Load instruction template if defined by name rather than by value
if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
@ -174,8 +183,6 @@ def infer_loader(model_name, model_settings, hf_quant_method=None):
loader = 'ExLlamav3_HF'
elif re.match(r'.*exl2', model_name.lower()):
loader = 'ExLlamav2_HF'
elif re.match(r'.*-hqq', model_name.lower()):
return 'HQQ'
else:
loader = 'Transformers'
@ -209,15 +216,27 @@ def apply_model_settings_to_state(model, state):
model_settings = get_model_metadata(model)
if 'loader' in model_settings:
loader = model_settings.pop('loader')
# If the user is using an alternative loader for the same model type, let them keep using it
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
state['loader'] = loader
for k in model_settings:
if k in state:
if k in state and k != 'gpu_layers': # Skip gpu_layers, handle separately
state[k] = model_settings[k]
# Handle GPU layers and VRAM update for llama.cpp
if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
vram_info, gpu_layers_update = update_gpu_layers_and_vram(
state['loader'],
model,
model_settings['gpu_layers'],
state['ctx_size'],
state['cache_type'],
auto_adjust=True
)
state['gpu_layers'] = gpu_layers_update
state['vram_info'] = vram_info
return state
@ -277,3 +296,197 @@ def save_instruction_template(model, template):
yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
else:
yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")
@functools.lru_cache(maxsize=1)
def load_gguf_metadata_with_cache(model_file):
return metadata_gguf.load_metadata(model_file)
def get_model_size_mb(model_file: Path) -> float:
filename = model_file.name
# Check for multipart pattern
match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename)
if match:
# It's a multipart file, find all matching parts
base_pattern = match.group(1)
part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf'))
total_size = sum(p.stat().st_size for p in part_files)
else:
# Single part
total_size = model_file.stat().st_size
return total_size / (1024 ** 2) # Return size in MB
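# Sketch of the multipart case above (filename is illustrative): for
# "model-00001-of-00003.gguf", the reported size is the sum of all three
# "model-*-of-*.gguf" shards rather than just the first file.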
def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
metadata = load_gguf_metadata_with_cache(model_file)
size_in_mb = get_model_size_mb(model_file)
# Extract values from metadata
n_layers = None
n_kv_heads = None
embedding_dim = None
for key, value in metadata.items():
if key.endswith('.block_count'):
n_layers = value
elif key.endswith('.attention.head_count_kv'):
n_kv_heads = max(value) if isinstance(value, list) else value
elif key.endswith('.embedding_length'):
embedding_dim = value
if gpu_layers > n_layers:
gpu_layers = n_layers
# Convert cache_type to numeric
if cache_type == 'q4_0':
cache_type = 4
elif cache_type == 'q8_0':
cache_type = 8
else:
cache_type = 16
# Derived features
size_per_layer = size_in_mb / max(n_layers, 1e-6)
kv_cache_factor = n_kv_heads * cache_type * ctx_size
embedding_per_context = embedding_dim / ctx_size
# Calculate VRAM using the model
# Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
vram = (
(size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
* (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
+ 1516.522943869404
)
return vram
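# Rough usage sketch (argument values are assumptions, not measurements):
# estimate the VRAM needed to offload 40 layers of a hypothetical GGUF at
# 8192 context with a q8_0 cache.
def _example_vram_estimate(gguf_name="model.gguf"):
    return estimate_vram(gguf_name, gpu_layers=40, ctx_size=8192, cache_type='q8_0')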
def get_nvidia_vram(return_free=True):
"""
Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output.
Args:
return_free (bool): If True, returns free VRAM. If False, returns total VRAM.
Returns:
int: Either the total free VRAM or total VRAM in MiB summed across all detected NVIDIA GPUs.
Returns -1 if nvidia-smi command fails (not found, error, etc.).
Returns 0 if nvidia-smi succeeds but no GPU memory info found.
"""
try:
# Execute nvidia-smi command
result = subprocess.run(
['nvidia-smi'],
capture_output=True,
text=True,
check=False
)
# Check if nvidia-smi returned an error
if result.returncode != 0:
return -1
# Parse the output for memory usage patterns
output = result.stdout
# Find memory usage like "XXXXMiB / YYYYMiB"
# Captures used and total memory for each GPU
matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output)
if not matches:
# No GPUs found in expected format
return 0
total_vram_mib = 0
total_free_vram_mib = 0
for used_mem_str, total_mem_str in matches:
try:
used_mib = int(used_mem_str)
total_mib = int(total_mem_str)
total_vram_mib += total_mib
total_free_vram_mib += (total_mib - used_mib)
except ValueError:
# Skip malformed entries
pass
# Return either free or total VRAM based on the flag
return total_free_vram_mib if return_free else total_vram_mib
except FileNotFoundError:
# nvidia-smi not found (likely no NVIDIA drivers installed)
return -1
except Exception:
# Handle any other unexpected exceptions
return -1
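The regex above keys off the "used / total" memory cell in nvidia-smi's default table. A self-contained sketch with invented numbers shows how the per-GPU figures accumulate:

```python
import re

# Assumed shape of the memory cells in nvidia-smi's table (numbers invented):
sample_output = """
|   0  NVIDIA GeForce RTX 3090  |  1234MiB / 24576MiB  |
|   1  NVIDIA GeForce RTX 3090  |   512MiB / 24576MiB  |
"""

matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", sample_output)
total_free = sum(int(total) - int(used) for used, total in matches)
total_all = sum(int(total) for _, total in matches)
print(total_free, total_all)  # 47406 49152
```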
def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
"""
Unified function to handle GPU layers and VRAM updates.
Args:
for_ui: If True, returns Gradio updates. If False, returns raw values.
Returns:
- If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
- If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
"""
if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"):
vram_info = "<div id=\"vram-info\"'>Estimated VRAM to load the model:</div>"
if for_ui:
return (vram_info, gr.update()) if auto_adjust else vram_info
else:
return (0, gpu_layers) if auto_adjust else 0
current_layers = gpu_layers
max_layers = gpu_layers
if auto_adjust:
# Get model settings including user preferences
model_settings = get_model_metadata(model)
# Get the true maximum layers
max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers))
# Check if this is a user-saved setting
user_config = shared.user_config
model_regex = Path(model).name + '$'
has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex]
if has_user_setting:
# For user settings, just use the current value (which already has user pref)
# but ensure the slider maximum is correct
current_layers = gpu_layers # Already has user setting
else:
# No user setting, auto-adjust from the maximum
current_layers = max_layers # Start from max
# Auto-adjust based on available/total VRAM
# If a model is loaded and it's for the UI, use the total VRAM to avoid confusion
return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True
available_vram = get_nvidia_vram(return_free=return_free)
if available_vram > 0:
tolerance = 577
while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
current_layers -= 1
# Calculate VRAM with current layers
vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)
if for_ui:
vram_info = f"<div id=\"vram-info\"'>Estimated VRAM to load the model: <span class=\"value\">{vram_usage:.0f} MiB</span></div>"
if auto_adjust:
return vram_info, gr.update(value=current_layers, maximum=max_layers)
else:
return vram_info
else:
if auto_adjust:
return vram_usage, current_layers
else:
return vram_usage
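For callers outside the UI, for_ui=False returns raw numbers instead of Gradio updates. A hedged sketch, using the import shown later in this diff and a hypothetical model file:

```python
# Non-UI usage sketch: for_ui=False returns raw numbers instead of Gradio updates.
from modules.models_settings import update_gpu_layers_and_vram

vram_mb, layers = update_gpu_layers_and_vram(
    loader='llama.cpp',
    model='Qwen_Qwen3-8B-Q4_K_M.gguf',  # hypothetical GGUF in the models folder
    gpu_layers=256,
    ctx_size=8192,
    cache_type='fp16',
    auto_adjust=True,  # lowers gpu_layers until the estimate fits free VRAM minus the 577 MiB tolerance
    for_ui=False,
)
print(f"{layers} GPU layers, ~{vram_mb:.0f} MiB estimated")
```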

View file

@ -11,7 +11,7 @@ from modules.logging_colors import logger
def default_preset():
return {
result = {
'temperature': 1,
'dynatemp_low': 1,
'dynatemp_high': 1,
@ -46,10 +46,17 @@ def default_preset():
'do_sample': True,
'dynamic_temperature': False,
'temperature_last': False,
'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntop_n_sigma\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
'dry_sequence_breakers': '"\\n", ":", "\\"", "*"',
}
if shared.args.portable:
samplers = result['sampler_priority'].split('\n')
samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]]
result['sampler_priority'] = '\n'.join(samplers)
return result
def presets_params():
return [k for k in default_preset()]
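With the portable filter above, only samplers supported by the llama.cpp backend survive in the priority list. A small worked example using the new default string:

```python
# Worked example of the portable-mode filter above.
default_priority = (
    'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntop_n_sigma\n'
    'temperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\n'
    'epsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\n'
    'encoder_repetition_penalty\nno_repeat_ngram'
)
portable_allowed = ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]
filtered = [s for s in default_priority.split('\n') if s in portable_allowed]
print('\n'.join(filtered))
# repetition_penalty, dry, top_n_sigma, temperature, top_k, top_p, typical_p, min_p, xtc (in that order)
```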

View file

@ -60,7 +60,6 @@ settings = {
'custom_stopping_strings': '',
'custom_token_bans': '',
'negative_prompt': '',
'autoload_model': False,
'dark_theme': True,
'default_extensions': [],
'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}",
@ -88,7 +87,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, TensorRT-LLM.')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.')
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
@ -121,7 +120,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
@ -130,9 +129,9 @@ group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
# Cache
group = parser.add_argument_group('Context and cache management')
group = parser.add_argument_group('Context and cache')
group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.')
group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
# Speculative decoding
group = parser.add_argument_group('Speculative decoding')
@ -153,18 +152,10 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n
group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
# HQQ
group = parser.add_argument_group('HQQ')
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
# TensorRT-LLM
group = parser.add_argument_group('TensorRT-LLM')
group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
# Cache
group = parser.add_argument_group('Cache')
group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
# DeepSpeed
group = parser.add_argument_group('DeepSpeed')
group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
@ -190,6 +181,7 @@ group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certific
group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None)
group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy')
group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.')
group.add_argument('--portable', action='store_true', help='Hide features not available in portable mode like training.')
# API
group = parser.add_argument_group('API')
@ -267,8 +259,6 @@ def fix_loader_name(name):
return 'ExLlamav2_HF'
elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']:
return 'ExLlamav3_HF'
elif name in ['hqq']:
return 'HQQ'
elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
return 'TensorRT-LLM'
@ -311,11 +301,13 @@ if args.api or args.public_api:
add_extension('openai', last=True)
# Load model-specific settings
with Path(f'{args.model_dir}/config.yaml') as p:
p = Path(f'{args.model_dir}/config.yaml')
if p.exists():
model_config = yaml.safe_load(open(p, 'r').read())
else:
model_config = {}
del p
# Load custom model-specific settings
user_config = load_user_config()

View file

@ -1,15 +1,15 @@
from pathlib import Path
import torch
import tensorrt_llm
import torch
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
from modules import shared
from modules.logging_colors import logger
from modules.text_generation import (
get_max_prompt_length,
get_reply_from_output_ids
)
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
class TensorRTLLMModel:

View file

@ -14,6 +14,7 @@ from modules.callbacks import Iteratorize
from modules.extensions import apply_extensions
from modules.html_generator import generate_basic_html
from modules.logging_colors import logger
from modules.utils import check_model_loaded
def generate_reply(*args, **kwargs):
@ -34,8 +35,8 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
# Find the appropriate generation function
generate_func = apply_extensions('custom_generate_reply')
if generate_func is None:
if shared.model_name == 'None' or shared.model is None:
logger.error("No model is loaded! Select one in the Model tab.")
model_is_loaded, error_message = check_model_loaded()
if not model_is_loaded:
yield ''
return
@ -471,7 +472,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
t1 = time.time()
original_tokens = len(original_input_ids[0])
new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
return
@ -480,7 +481,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
For models that do not use the transformers library for sampling
"""
seed = set_manual_seed(state['seed'])
state['seed'] = set_manual_seed(state['seed'])
t0 = time.time()
reply = ''
try:
@ -500,15 +501,15 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
t1 = time.time()
original_tokens = len(encode(original_question)[0])
new_tokens = len(encode(original_question + reply)[0]) - original_tokens
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})')
return
def print_prompt(prompt, max_chars=2000):
def print_prompt(prompt, max_chars=-1):
DARK_YELLOW = "\033[38;5;3m"
RESET = "\033[0m"
if len(prompt) > max_chars:
if max_chars > 0 and len(prompt) > max_chars:
half_chars = max_chars // 2
hidden_len = len(prompt[half_chars:-half_chars])
hidden_msg = f"{DARK_YELLOW}[...{hidden_len} characters hidden...]{RESET}"

View file

@ -61,7 +61,7 @@ if not shared.args.old_colors:
background_fill_primary_dark='var(--darker-gray)',
body_background_fill="white",
block_background_fill="transparent",
body_text_color="#333",
body_text_color='rgb(64, 64, 64)',
button_secondary_background_fill="#f4f4f4",
button_secondary_border_color="var(--border-color-primary)",
@ -71,6 +71,7 @@ if not shared.args.old_colors:
block_background_fill_dark='transparent',
block_border_color_dark='transparent',
input_border_color_dark='var(--border-color-dark)',
input_border_color_focus_dark='var(--border-color-dark)',
checkbox_border_color_dark='var(--border-color-dark)',
border_color_primary_dark='var(--border-color-dark)',
button_secondary_border_color_dark='var(--border-color-dark)',
@ -89,6 +90,8 @@ if not shared.args.old_colors:
checkbox_label_shadow='none',
block_shadow='none',
block_shadow_dark='none',
input_shadow_focus='none',
input_shadow_focus_dark='none',
button_large_radius='0.375rem',
button_large_padding='6px 12px',
input_radius='0.375rem',
@ -105,11 +108,10 @@ def list_model_elements():
'filter_by_loader',
'loader',
'cpu_memory',
'n_gpu_layers',
'gpu_layers',
'threads',
'threads_batch',
'batch_size',
'hqq_backend',
'ctx_size',
'cache_type',
'tensor_split',
@ -211,6 +213,15 @@ def list_interface_input_elements():
'negative_prompt',
'dry_sequence_breakers',
'grammar_string',
'navigate_message_index',
'navigate_direction',
'navigate_message_role',
'edit_message_index',
'edit_message_text',
'edit_message_role',
'branch_index',
'enable_web_search',
'web_search_pages',
]
# Chat elements

View file

@ -24,7 +24,8 @@ def create_ui():
with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']):
with gr.Column():
with gr.Row(elem_id='past-chats-buttons'):
shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', interactive=not mu)
shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', elem_id='Branch', interactive=not mu)
shared.gradio['branch_index'] = gr.Number(value=-1, precision=0, visible=False, elem_id="Branch-index", interactive=True)
shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu)
shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input'])
@ -46,14 +47,14 @@ def create_ui():
with gr.Row():
with gr.Column(elem_id='chat-col'):
shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer
shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': [], 'metadata': {}}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
with gr.Row(elem_id="chat-input-row"):
with gr.Column(scale=1, elem_id='gr-hover-container'):
gr.HTML(value='<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">&#9776;</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover')
with gr.Column(scale=10, elem_id='chat-input-container'):
shared.gradio['textbox'] = gr.Textbox(label='', placeholder='Send a message', elem_id='chat-input', elem_classes=['add_scrollbar'])
shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar'])
shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls')
shared.gradio['typing-dots'] = gr.HTML(value='<div class="typing"><span></span><span class="dot1"></span><span class="dot2"></span></div>', label='typing', elem_id='typing-container')
@ -70,8 +71,6 @@ def create_ui():
shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last')
with gr.Row():
shared.gradio['Replace last reply'] = gr.Button('Replace last reply (Ctrl + Shift + L)', elem_id='Replace-last')
shared.gradio['Copy last reply'] = gr.Button('Copy last reply (Ctrl + Shift + K)', elem_id='Copy-last')
shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate')
with gr.Row():
@ -79,14 +78,20 @@ def create_ui():
shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply')
with gr.Row():
shared.gradio['send-chat-to-default'] = gr.Button('Send to default')
shared.gradio['send-chat-to-notebook'] = gr.Button('Send to notebook')
shared.gradio['send-chat-to-default'] = gr.Button('Send to Default')
shared.gradio['send-chat-to-notebook'] = gr.Button('Send to Notebook')
with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']):
with gr.Column():
with gr.Row():
shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
with gr.Row():
shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search')
with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']:
shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10)
with gr.Row():
shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template is used.', elem_id='chat-mode')
@ -96,6 +101,22 @@ def create_ui():
with gr.Row():
shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])
with gr.Row():
shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm')
shared.gradio['token_display'] = gr.HTML(value='', elem_classes='token-display')
# Hidden elements for version navigation and editing
with gr.Row(visible=False):
shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index")
shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction")
shared.gradio['navigate_message_role'] = gr.Textbox(value="", elem_id="Navigate-message-role")
shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version")
shared.gradio['edit_message_index'] = gr.Number(value=-1, precision=0, elem_id="Edit-message-index")
shared.gradio['edit_message_text'] = gr.Textbox(value="", elem_id="Edit-message-text")
shared.gradio['edit_message_role'] = gr.Textbox(value="", elem_id="Edit-message-role")
shared.gradio['edit_message'] = gr.Button(elem_id="Edit-message")
def create_chat_settings_ui():
mu = shared.args.multi_user
@ -185,7 +206,7 @@ def create_event_handlers():
shared.gradio['Generate'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then(
lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then(
lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then(
chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then(
None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then(
@ -193,7 +214,7 @@ def create_event_handlers():
shared.gradio['textbox'].submit(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then(
lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then(
lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then(
chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then(
None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then(
@ -221,10 +242,6 @@ def create_event_handlers():
None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then(
None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
shared.gradio['Replace last reply'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_replace_last_reply_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False)
shared.gradio['Send dummy message'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_send_dummy_message_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False)
@ -258,7 +275,7 @@ def create_event_handlers():
shared.gradio['branch_chat'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False)
chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'branch_index'), show_progress=False)
shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False)
shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False)
@ -290,7 +307,14 @@ def create_event_handlers():
None, gradio('mode'), None, js="(mode) => {mode === 'instruct' ? document.getElementById('character-menu').parentNode.parentNode.style.display = 'none' : document.getElementById('character-menu').parentNode.parentNode.style.display = ''}")
shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False)
shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False)
shared.gradio['navigate_version'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_navigate_version_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False)
shared.gradio['edit_message'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False)
# Save/delete a character
shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False)
@ -347,3 +371,13 @@ def create_event_handlers():
None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
shared.gradio['show_controls'].change(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
shared.gradio['count_tokens'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.count_prompt_tokens, gradio('textbox', 'interface_state'), gradio('token_display'), show_progress=False)
shared.gradio['enable_web_search'].change(
lambda x: gr.update(visible=x),
gradio('enable_web_search'),
gradio('web_search_row')
)

View file

@ -14,6 +14,7 @@ from modules.models_settings import (
get_model_metadata,
save_instruction_template,
save_model_settings,
update_gpu_layers_and_vram,
update_model_parameters
)
from modules.utils import gradio
@ -23,38 +24,58 @@ def create_ui():
mu = shared.args.multi_user
with gr.Tab("Model", elem_id="model-tab"):
with gr.Row():
with gr.Column():
with gr.Row():
with gr.Column():
with gr.Row():
shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu)
shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu)
shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)
with gr.Column():
with gr.Row():
shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
with gr.Row():
with gr.Column():
shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None)
with gr.Blocks():
gr.Markdown("## Main options")
with gr.Row():
with gr.Column():
shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).')
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
# Speculative decoding
with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
with gr.Row():
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.')
shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
gr.Markdown("## Other options")
with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'):
with gr.Row():
with gr.Column():
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
@ -65,12 +86,6 @@ def create_ui():
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
with gr.Column():
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
@ -78,36 +93,20 @@ def create_ui():
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation="eager" while loading the model.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).')
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
# Speculative decoding
with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
if not shared.args.portable:
with gr.Row():
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.')
shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
with gr.Column():
with gr.Row():
shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu)
with gr.Tab("Download"):
shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu)
shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu)
@ -132,11 +131,10 @@ def create_event_handlers():
# In this event handler, the interface state is read and updated
# with the model defaults (if any), and then the model is loaded
shared.gradio['model_menu'].change(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then(
load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success(
partial(load_model_wrapper, autoload=False), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success(
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)
shared.gradio['load_model'].click(
@ -145,15 +143,31 @@ def create_event_handlers():
partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success(
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)
shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False)
shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then(
partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False)
shared.gradio['save_model_settings'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)
# For ctx_size and cache_type - auto-adjust GPU layers
for param in ['ctx_size', 'cache_type']:
shared.gradio[param].change(
partial(update_gpu_layers_and_vram, auto_adjust=True),
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info', 'gpu_layers'), show_progress=False)
# For manual gpu_layers changes - only update VRAM
shared.gradio['gpu_layers'].change(
partial(update_gpu_layers_and_vram, auto_adjust=False),
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info'), show_progress=False)
if not shared.args.portable:
shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model'))
shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True)
@ -192,6 +206,26 @@ def load_lora_wrapper(selected_loras):
def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
try:
# Handle direct GGUF URLs
if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")):
try:
path = repo_id.split("huggingface.co/")[1]
# Extract the repository ID (first two parts of the path)
parts = path.split("/")
if len(parts) >= 2:
extracted_repo_id = f"{parts[0]}/{parts[1]}"
# Extract the filename (last part of the path)
filename = repo_id.split("/")[-1]
if "?download=true" in filename:
filename = filename.replace("?download=true", "")
repo_id = extracted_repo_id
specific_file = filename
except:
pass
if repo_id == "":
yield ("Please enter a model path")
return
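A quick sketch of what the URL branch above extracts from a typical Hugging Face file link (the URL is illustrative, not taken from this diff):

```python
# Illustrative parse of a direct GGUF link (hypothetical URL).
repo_id = "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_K_M.gguf?download=true"

path = repo_id.split("huggingface.co/")[1]           # "TheBloke/Llama-2-7B-GGUF/resolve/main/..."
parts = path.split("/")
extracted_repo_id = f"{parts[0]}/{parts[1]}"         # "TheBloke/Llama-2-7B-GGUF"
filename = repo_id.split("/")[-1].replace("?download=true", "")  # "llama-2-7b.Q4_K_M.gguf"
print(extracted_repo_id, filename)
```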
@ -205,6 +239,18 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
yield ("Getting the download links from Hugging Face")
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file)
# Check for multiple GGUF files
gguf_files = [link for link in links if link.lower().endswith('.gguf')]
if len(gguf_files) > 1 and not specific_file:
output = "Multiple GGUF files found. Please copy one of the following filenames to the 'File name' field:\n\n```\n"
for link in gguf_files:
output += f"{Path(link).name}\n"
output += "```"
yield output
return
if return_links:
output = "```\n"
for link in links:
@ -252,10 +298,34 @@ def update_truncation_length(current_length, state):
return current_length
def get_initial_vram_info():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
return update_gpu_layers_and_vram(
shared.args.loader,
shared.model_name,
shared.args.gpu_layers,
shared.args.ctx_size,
shared.args.cache_type,
auto_adjust=False,
for_ui=True
)
return "<div id=\"vram-info\"'>Estimated VRAM to load the model:</div>"
def get_initial_gpu_layers_max():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
model_settings = get_model_metadata(shared.model_name)
return model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', 256))
return 256
def handle_load_model_event_initial(model, state):
state = apply_model_settings_to_state(model, state)
output = ui.apply_interface_values(state)
update_model_parameters(state)
update_model_parameters(state) # This updates the command-line flags
return output + [state]

View file

@ -21,7 +21,7 @@ def create_ui(default_preset):
shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button')
with gr.Column():
shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown')
shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown')
with gr.Row():
with gr.Column():
@ -82,7 +82,7 @@ def create_ui(default_preset):
shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')

View file

@ -23,10 +23,14 @@ def create_ui():
shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table')
with gr.Column():
if not shared.args.portable:
extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
extension_status = gr.Markdown()
else:
pass
shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light')
if not shared.args.portable:
extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
# Reset interface event

View file

@ -72,6 +72,20 @@ def natural_keys(text):
return [atoi(c) for c in re.split(r'(\d+)', text)]
def check_model_loaded():
if shared.model_name == 'None' or shared.model is None:
if len(get_available_models()) == 0:
error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it"
logger.error(error_msg)
return False, error_msg
else:
error_msg = "No model is loaded. Please select one in the Model tab."
logger.error(error_msg)
return False, error_msg
return True, None
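The new helper returns a (loaded, error_message) tuple, which is how generate_reply consumes it earlier in this diff. A minimal calling sketch:

```python
# Minimal calling sketch, mirroring the change to _generate_reply() above.
from modules.utils import check_model_loaded

model_is_loaded, error_message = check_model_loaded()
if not model_is_loaded:
    # check_model_loaded() has already logged the error; error_message is
    # available for surfacing in the UI or API response if needed.
    print(error_message)
```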
def get_available_models():
# Get all GGUF files
gguf_files = get_available_ggufs()
@ -123,7 +137,7 @@ def get_available_models():
model_dirs = sorted(model_dirs, key=natural_keys)
return ['None'] + filtered_gguf_files + model_dirs
return filtered_gguf_files + model_dirs
def get_available_ggufs():

modules/web_search.py Normal file
View file

@ -0,0 +1,129 @@
import concurrent.futures
from concurrent.futures import as_completed
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from modules.logging_colors import logger
def get_current_timestamp():
"""Returns the current time in 24-hour format"""
return datetime.now().strftime('%b %d, %Y %H:%M')
def download_web_page(url, timeout=5):
"""Download and extract text from a web page"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers, timeout=timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style"]):
script.decompose()
# Get text and clean it up
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = ' '.join(chunk for chunk in chunks if chunk)
return text
except Exception as e:
logger.error(f"Error downloading {url}: {e}")
return f"[Error downloading content from {url}: {str(e)}]"
def perform_web_search(query, num_pages=3, max_workers=5):
"""Perform web search and return results with content"""
try:
with DDGS() as ddgs:
results = list(ddgs.text(query, max_results=num_pages))
# Prepare download tasks
download_tasks = []
for i, result in enumerate(results):
url = result.get('href', '')
title = result.get('title', f'Search Result {i+1}')
download_tasks.append((url, title, i))
search_results = [None] * len(download_tasks) # Pre-allocate to maintain order
# Download pages in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all download tasks
future_to_task = {
executor.submit(download_web_page, task[0]): task
for task in download_tasks
}
# Collect results as they complete
for future in as_completed(future_to_task):
url, title, index = future_to_task[future]
try:
content = future.result()
search_results[index] = {
'title': title,
'url': url,
'content': content
}
except Exception as e:
logger.error(f"Error downloading {url}: {e}")
# Include failed downloads with empty content
search_results[index] = {
'title': title,
'url': url,
'content': ''
}
return search_results
except Exception as e:
logger.error(f"Error performing web search: {e}")
return []
def add_web_search_attachments(history, row_idx, user_message, search_query, state):
"""Perform web search and add results as attachments"""
if not search_query:
logger.warning("No search query provided")
return
try:
logger.info(f"Using search query: {search_query}")
# Perform web search
num_pages = int(state.get('web_search_pages', 3))
search_results = perform_web_search(search_query, num_pages)
if not search_results:
logger.warning("No search results found")
return
# Add search results as attachments
key = f"user_{row_idx}"
if key not in history['metadata']:
history['metadata'][key] = {"timestamp": get_current_timestamp()}
if "attachments" not in history['metadata'][key]:
history['metadata'][key]["attachments"] = []
for result in search_results:
attachment = {
"name": result['title'],
"type": "text/html",
"url": result['url'],
"content": result['content']
}
history['metadata'][key]["attachments"].append(attachment)
logger.info(f"Added {len(search_results)} web search results as attachments")
except Exception as e:
logger.error(f"Error in web search: {e}")

View file

@@ -126,7 +126,7 @@ def check_env():
        sys.exit(1)

    # Ensure this is a new environment and not the base environment
    if os.environ["CONDA_DEFAULT_ENV"] == "base":
    if os.environ.get("CONDA_DEFAULT_ENV", "") == "base":
        print("Create an environment for this project and activate it. Exiting...")
        sys.exit(1)
@@ -222,7 +222,7 @@ def update_pytorch_and_python():
    if "+cu" in torver:
        install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
    elif "+rocm" in torver:
        install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
        install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4"
    elif "+cpu" in torver:
        install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
    elif "+cxx11" in torver:
@@ -273,7 +273,7 @@ def install_webui():
        "What is your GPU?",
        {
            'A': 'NVIDIA - CUDA 12.4',
            'B': 'AMD - Linux/macOS only, requires ROCm 6.1',
            'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4',
            'C': 'Apple M Series',
            'D': 'Intel Arc (beta)',
            'N': 'CPU mode'
@@ -314,7 +314,7 @@ def install_webui():
    if selected_gpu == "NVIDIA":
        install_pytorch += "--index-url https://download.pytorch.org/whl/cu124"
    elif selected_gpu == "AMD":
        install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1"
        install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4"
    elif selected_gpu in ["APPLE", "NONE"]:
        install_pytorch += "--index-url https://download.pytorch.org/whl/cpu"
    elif selected_gpu == "INTEL":

View file

@ -1,7 +1,9 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
bitsandbytes==0.45.*
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -13,6 +15,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -30,12 +33,12 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,6 +32,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,6 +32,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,7 +32,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,8 +32,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,5 +32,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,5 +32,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -1,7 +1,9 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
bitsandbytes==0.45.*
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -13,6 +15,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -30,12 +33,12 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich

View file

@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -1,18 +0,0 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -1,18 +0,0 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,6 +18,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"

View file

@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich

View file

@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -51,6 +51,7 @@ from modules.models import load_model, unload_model_if_idle
from modules.models_settings import (
    get_fallback_settings,
    get_model_metadata,
    update_gpu_layers_and_vram,
    update_model_parameters
)
from modules.shared import do_cmd_flags_warnings
@@ -90,7 +91,7 @@ def create_interface():
        'instruction_template_str': shared.settings['instruction_template_str'],
        'prompt_menu-default': shared.settings['prompt-default'],
        'prompt_menu-notebook': shared.settings['prompt-notebook'],
        'filter_by_loader': shared.args.loader or 'All'
        'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp'
    })

    if Path("user_data/cache/pfp_character.png").exists():
@@ -127,6 +128,7 @@ def create_interface():
        ui_parameters.create_ui(shared.settings['preset'])  # Parameters tab
        ui_model_menu.create_ui()  # Model tab

        if not shared.args.portable:
            training.create_ui()  # Training tab

        ui_session.create_ui()  # Session tab
@@ -247,6 +249,20 @@ if __name__ == "__main__":
        model_settings = get_model_metadata(model_name)
        update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments

        # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
        if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
            vram_usage, adjusted_layers = update_gpu_layers_and_vram(
                shared.args.loader,
                model_name,
                model_settings['gpu_layers'],
                shared.args.ctx_size,
                shared.args.cache_type,
                auto_adjust=True,
                for_ui=False
            )

            shared.args.gpu_layers = adjusted_layers

        # Load the model
        shared.model, shared.tokenizer = load_model(model_name)

        if shared.args.lora:
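
In effect, startup now applies a clear precedence: a gpu_layers value the user supplied explicitly is respected as-is, and only when it is absent does the value from the model metadata get adjusted based on estimated VRAM usage. A rough sketch of that precedence, reusing only the names and return convention visible in the call above (not a verbatim excerpt of server.py):

# Hypothetical, condensed illustration of the startup precedence.
if 'gpu_layers' in shared.provided_arguments:
    layers = shared.args.gpu_layers  # explicit user setting wins, no adjustment
elif shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
    _, layers = update_gpu_layers_and_vram(
        shared.args.loader, model_name, model_settings['gpu_layers'],
        shared.args.ctx_size, shared.args.cache_type,
        auto_adjust=True, for_ui=False
    )  # metadata default, adjusted to the available VRAM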

View file

@ -1,10 +1,15 @@
#!/usr/bin/env bash
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch "$@"
./portable_env/bin/python3 server.py --portable --api --auto-launch "$@"
exit $?
fi
@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
exit
fi
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
export CUDA_PATH="$INSTALL_ENV_DIR"
export CUDA_HOME="$CUDA_PATH"

View file

@ -1,10 +1,15 @@
#!/bin/bash
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@"
./portable_env/bin/python3 server.py --portable --api --auto-launch --api-port 5005 "$@"
exit $?
fi
@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
exit
fi
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
export CUDA_PATH="$INSTALL_ENV_DIR"
export CUDA_HOME="$CUDA_PATH"

View file

@ -1,11 +1,16 @@
@echo off
setlocal enabledelayedexpansion
@rem environment isolation
set PYTHONNOUSERSITE=1
set PYTHONPATH=
set PYTHONHOME=
cd /D "%~dp0"
@rem Portable install case
if exist "portable_env" (
.\portable_env\python.exe server.py --api --auto-launch %*
.\portable_env\python.exe server.py --portable --api --auto-launch %*
exit /b %errorlevel%
)
@ -87,10 +92,6 @@ if not exist "%INSTALL_ENV_DIR%" (
@rem check if conda environment was actually created
if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end )
@rem environment isolation
set PYTHONNOUSERSITE=1
set PYTHONPATH=
set PYTHONHOME=
set "CUDA_PATH=%INSTALL_ENV_DIR%"
set "CUDA_HOME=%CUDA_PATH%"

View file

@ -31,7 +31,6 @@ seed: -1
custom_stopping_strings: ''
custom_token_bans: ''
negative_prompt: ''
autoload_model: false
dark_theme: true
default_extensions: []
instruction_template_str: |-