Merge pull request #6912 from oobabooga/dev

Merge dev branch
oobabooga 2025-04-27 00:03:16 -03:00 committed by GitHub
commit 9bb9ce079e
GPG key ID: B5690EEEBB952194
158 changed files with 893 additions and 732 deletions

View file

@ -101,7 +101,7 @@ jobs:
- name: Build Package
shell: bash
run: |
rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
# Define common variables
CUDA_VERSION="${{ matrix.cuda }}"

View file

@ -100,7 +100,7 @@ jobs:
- name: Build Package
shell: bash
run: |
rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"

View file

@ -100,7 +100,7 @@ jobs:
- name: Build Package
shell: bash
run: |
rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"

.gitignore (vendored, 20 changed lines)
View file

@ -1,26 +1,8 @@
/cache
/characters
/css
/extensions
/grammars
/installer_files
/logs
/loras
/models
/presets
/prompts
/repositories
/softprompts
/torch-dumps
/training/datasets
/CMD_FLAGS.txt
/img_bot*
/img_me*
/models/config-user.yaml
/notification.mp3
/settings*.json
/settings*.yaml
/user_data
.chroma
.DS_Store
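
The user-facing folders formerly ignored at the repository root now live under `user_data/`. A hedged migration sketch for existing installs follows; the folder mapping is inferred from this diff, the list is not exhaustive, and the bundled update wizard may handle the move for you:

```
# Hedged migration sketch, not part of this commit. Paths are assumptions
# based on the .gitignore and path changes in this diff.
import shutil
from pathlib import Path

OLD_TO_NEW = {
    "cache": "user_data/cache",
    "characters": "user_data/characters",
    "loras": "user_data/loras",
    "models": "user_data/models",
    "presets": "user_data/presets",
    "prompts": "user_data/prompts",
    "logs": "user_data/logs",
    "training/datasets": "user_data/training/datasets",
    "CMD_FLAGS.txt": "user_data/CMD_FLAGS.txt",
}

for old, new in OLD_TO_NEW.items():
    src, dst = Path(old), Path(new)
    if src.exists() and not dst.exists():
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(src), str(dst))  # move the old folder/file into user_data/
        print(f"moved {src} -> {dst}")
```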

View file

@ -1,3 +0,0 @@
# Only used by the one-click installer.
# Example:
# --listen --api

README.md (242 changed lines)
View file

@ -43,7 +43,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases
To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again.
You can use command-line flags, like `./start_linux.sh --help`, or add them to `CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
<details>
<summary>
@ -157,7 +157,7 @@ mkdir -p logs cache
# TORCH_CUDA_ARCH_LIST based on your GPU model
# APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal)
# BUILD_EXTENIONS optionally add comma separated list of extensions to build
# Edit CMD_FLAGS.txt and add in it the options you want to execute (like --listen --cpu)
# Edit user_data/CMD_FLAGS.txt and add in it the options you want to execute (like --listen --cpu)
#
docker compose up --build
```
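
For reference (not part of the diff), `user_data/CMD_FLAGS.txt` simply holds the flags you would otherwise pass on the command line, e.g.:

```
--listen --api
```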
@ -182,131 +182,139 @@ List of command-line flags
</summary>
```txt
usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--settings SETTINGS]
[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices] [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]]
[--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast]
[--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn]
[--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT]
[--numa] [--no-kv-offload] [--row-split] [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa]
[--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR]
[--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT]
[--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
[--subpath SUBPATH] [--old-colors] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6]
[--api-disable-ipv4] [--nowebui]
usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS]
[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
[--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit]
[--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap]
[--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N]
[--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT]
[--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner]
[--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE]
[--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH]
[--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api]
[--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
Text generation web UI
options:
-h, --help show this help message and exit
-h, --help show this help message and exit
Basic settings:
--multi-user Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.
--character CHARACTER The name of the character to load in chat mode by default.
--model MODEL Name of the model to load by default.
--lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.
--model-dir MODEL_DIR Path to directory with all the models.
--lora-dir LORA_DIR Path to directory with all the loras.
--settings SETTINGS Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this
file will be loaded by default without the need to use the --settings flag.
--extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.
--verbose Print the prompts to the terminal.
--idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.
--multi-user Multi-user mode. Chat histories are not saved or automatically loaded. Warning: this is likely not safe for sharing publicly.
--character CHARACTER The name of the character to load in chat mode by default.
--model MODEL Name of the model to load by default.
--lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.
--model-dir MODEL_DIR Path to directory with all the models.
--lora-dir LORA_DIR Path to directory with all the loras.
--model-menu Show a model menu in the terminal when the web UI is first launched.
--settings SETTINGS Load the default interface settings from this yaml file. See user_data/settings-template.yaml for an example. If you create a file called
user_data/settings.yaml, this file will be loaded by default without the need to use the --settings flag.
--extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.
--verbose Print the prompts to the terminal.
--idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.
Model loader:
--loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2,
HQQ, TensorRT-LLM.
--loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ,
TensorRT-LLM.
Transformers/Accelerate:
--cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow.
--auto-devices Automatically split the model across the available GPU(s) and CPU.
--gpu-memory GPU_MEMORY [GPU_MEMORY ...] Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values
in MiB like --gpu-memory 3500MiB.
--cpu-memory CPU_MEMORY Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.
--disk If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.
--disk-cache-dir DISK_CACHE_DIR Directory to save the disk cache to. Defaults to "cache".
--load-in-8bit Load the model with 8-bit precision (using bitsandbytes).
--bf16 Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.
--no-cache Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.
--trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models.
--force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution.
--no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast.
--use_flash_attention_2 Set use_flash_attention_2=True while loading the model.
--use_eager_attention Set attn_implementation= eager while loading the model.
--torch-compile Compile the model with torch.compile for improved performance.
--cpu Use the CPU to generate text. Warning: Training on CPU is extremely slow.
--cpu-memory CPU_MEMORY Maximum CPU memory in GiB. Use this for CPU offloading.
--disk If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.
--disk-cache-dir DISK_CACHE_DIR Directory to save the disk cache to. Defaults to "user_data/cache".
--load-in-8bit Load the model with 8-bit precision (using bitsandbytes).
--bf16 Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.
--no-cache Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.
--trust-remote-code Set trust_remote_code=True while loading the model. Necessary for some models.
--force-safetensors Set use_safetensors=True while loading the model. This prevents arbitrary code execution.
--no_use_fast Set use_fast=False while loading the tokenizer (it's True by default). Use this if you have any problems related to use_fast.
--use_flash_attention_2 Set use_flash_attention_2=True while loading the model.
--use_eager_attention Set attn_implementation= eager while loading the model.
--torch-compile Compile the model with torch.compile for improved performance.
bitsandbytes 4-bit:
--load-in-4bit Load the model with 4-bit precision (using bitsandbytes).
--use_double_quant use_double_quant for 4-bit.
--compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32.
--quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4.
--load-in-4bit Load the model with 4-bit precision (using bitsandbytes).
--use_double_quant use_double_quant for 4-bit.
--compute_dtype COMPUTE_DTYPE compute dtype for 4-bit. Valid options: bfloat16, float16, float32.
--quant_type QUANT_TYPE quant_type for 4-bit. Valid options: nf4, fp4.
llama.cpp:
--flash-attn Use flash-attention.
--n_ctx N_CTX Size of the prompt context.
--threads THREADS Number of threads to use.
--threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing.
--batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval.
--no-mmap Prevent mmap from being used.
--mlock Force the system to keep the model in RAM.
--n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU.
--tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.
--numa Activate NUMA task allocation for llama.cpp.
--no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
--row-split Split the model by rows across GPUs. This may improve multi-gpu performance.
--flash-attn Use flash-attention.
--threads THREADS Number of threads to use.
--threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing.
--batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval.
--no-mmap Prevent mmap from being used.
--mlock Force the system to keep the model in RAM.
--n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU.
--tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.
--numa Activate NUMA task allocation for llama.cpp.
--no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
--row-split Split the model by rows across GPUs. This may improve multi-gpu performance.
--extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"
--streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
Context and cache management:
--ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens.
Speculative decoding:
--model-draft MODEL_DRAFT Path to the draft model for speculative decoding.
--draft-max DRAFT_MAX Number of tokens to draft for speculative decoding.
--gpu-layers-draft GPU_LAYERS_DRAFT Number of layers to offload to the GPU for the draft model.
--device-draft DEVICE_DRAFT Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1
--ctx-size-draft CTX_SIZE_DRAFT Size of the prompt context for the draft model. If 0, uses the same as the main model.
ExLlamaV2:
--gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.
--autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.
--max_seq_len MAX_SEQ_LEN Maximum sequence length.
--cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.
--no_flash_attn Force flash-attention to not be used.
--no_xformers Force xformers to not be used.
--no_sdpa Force Torch SDPA to not be used.
--num_experts_per_token NUM_EXPERTS_PER_TOKEN Number of experts to use for generation. Applies to MoE models like Mixtral.
--enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2.
--gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.
--autosplit Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.
--cfg-cache ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.
--no_flash_attn Force flash-attention to not be used.
--no_xformers Force xformers to not be used.
--no_sdpa Force Torch SDPA to not be used.
--num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral.
--enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2.
HQQ:
--hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.
--hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.
TensorRT-LLM:
--cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
--cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
Cache:
--cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.
--cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.
DeepSpeed:
--deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.
--nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading.
--local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups.
--deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.
--nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading.
--local_rank LOCAL_RANK DeepSpeed: Optional argument for distributed setups.
RoPE:
--alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.
--rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).
--compress_pos_emb COMPRESS_POS_EMB Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale.
--alpha_value ALPHA_VALUE Positional embeddings alpha factor for NTK RoPE scaling. Use either this or compress_pos_emb, not both.
--rope_freq_base ROPE_FREQ_BASE If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63).
--compress_pos_emb COMPRESS_POS_EMB Positional embeddings compression factor. Should be set to (context length) / (model's original context length). Equal to 1/rope_freq_scale.
Gradio:
--listen Make the web UI reachable from your local network.
--listen-port LISTEN_PORT The listening port that the server will use.
--listen-host LISTEN_HOST The hostname that the server will use.
--share Create a public URL. This is useful for running the web UI on Google Colab or similar.
--auto-launch Open the web UI in the default browser upon launch.
--gradio-auth GRADIO_AUTH Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3".
--gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above.
--ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file.
--ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file.
--subpath SUBPATH Customize the subpath for gradio, use with reverse proxy
--old-colors Use the legacy Gradio colors, before the December/2024 update.
--listen Make the web UI reachable from your local network.
--listen-port LISTEN_PORT The listening port that the server will use.
--listen-host LISTEN_HOST The hostname that the server will use.
--share Create a public URL. This is useful for running the web UI on Google Colab or similar.
--auto-launch Open the web UI in the default browser upon launch.
--gradio-auth GRADIO_AUTH Set Gradio authentication password in the format "username:password". Multiple credentials can also be supplied with "u1:p1,u2:p2,u3:p3".
--gradio-auth-path GRADIO_AUTH_PATH Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above.
--ssl-keyfile SSL_KEYFILE The path to the SSL certificate key file.
--ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file.
--subpath SUBPATH Customize the subpath for gradio, use with reverse proxy
--old-colors Use the legacy Gradio colors, before the December/2024 update.
API:
--api Enable the API extension.
--public-api Create a public URL for the API using Cloudfare.
--public-api-id PUBLIC_API_ID Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.
--api-port API_PORT The listening port for the API.
--api-key API_KEY API authentication key.
--admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.
--api-enable-ipv6 Enable IPv6 for the API
--api-disable-ipv4 Disable IPv4 for the API
--nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode.
--api Enable the API extension.
--public-api Create a public URL for the API using Cloudfare.
--public-api-id PUBLIC_API_ID Tunnel ID for named Cloudflare Tunnel. Use together with public-api option.
--api-port API_PORT The listening port for the API.
--api-key API_KEY API authentication key.
--admin-key ADMIN_KEY API authentication key for admin tasks like loading and unloading models. If not set, will be the same as --api-key.
--api-enable-ipv6 Enable IPv6 for the API
--api-disable-ipv4 Disable IPv4 for the API
--nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode.
```
</details>
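
Purely as an illustration (not in the README itself), a llama.cpp launch combining several of the flags above might look like this; the model filename and layer count are placeholders:

```
./start_linux.sh --model llama-2-13b-chat.Q4_K_M.gguf --n-gpu-layers 40 --ctx-size 8192 --flash-attn --listen --api
```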
@ -317,35 +325,37 @@ https://github.com/oobabooga/text-generation-webui/wiki
## Downloading models
Models should be placed in the folder `text-generation-webui/models`. They are usually downloaded from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads).
Models should be placed in the folder `text-generation-webui/user_data/models`. They are usually downloaded from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads).
* GGUF models are a single file and should be placed directly into `models`. Example:
* GGUF models are a single file and should be placed directly into `user_data/models`. Example:
```
text-generation-webui
└── models
└── llama-2-13b-chat.Q4_K_M.gguf
└── user_data
└── models
└── llama-2-13b-chat.Q4_K_M.gguf
```
* The remaining model types (like 16-bit Transformers models and EXL2 models) are made of several files and must be placed in a subfolder. Example:
```
text-generation-webui
├── models
│   ├── lmsys_vicuna-33b-v1.3
│   │   ├── config.json
│   │   ├── generation_config.json
│   │   ├── pytorch_model-00001-of-00007.bin
│   │   ├── pytorch_model-00002-of-00007.bin
│   │   ├── pytorch_model-00003-of-00007.bin
│   │   ├── pytorch_model-00004-of-00007.bin
│   │   ├── pytorch_model-00005-of-00007.bin
│   │   ├── pytorch_model-00006-of-00007.bin
│   │   ├── pytorch_model-00007-of-00007.bin
│   │   ├── pytorch_model.bin.index.json
│   │   ├── special_tokens_map.json
│   │   ├── tokenizer_config.json
│   │   └── tokenizer.model
└── user_data
└── models
└── lmsys_vicuna-33b-v1.3
├── config.json
├── generation_config.json
├── pytorch_model-00001-of-00007.bin
├── pytorch_model-00002-of-00007.bin
├── pytorch_model-00003-of-00007.bin
├── pytorch_model-00004-of-00007.bin
├── pytorch_model-00005-of-00007.bin
├── pytorch_model-00006-of-00007.bin
├── pytorch_model-00007-of-00007.bin
├── pytorch_model.bin.index.json
├── special_tokens_map.json
├── tokenizer_config.json
└── tokenizer.model
```
In both cases, you can use the "Model" tab of the UI to download the model from Hugging Face automatically. It is also possible to download it via the command-line with:
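
The command itself is cut off by the diff context here; judging by the `download-model.py` docstring later in this commit, it takes the form:

```
python download-model.py facebook/opt-1.3b
```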

View file

@ -1,11 +0,0 @@
@echo off
cd /D "%~dp0"
set PATH=%PATH%;%SystemRoot%\system32
@rem sed -i 's/\x0D$//' ./wsl.sh converts newlines to unix format in the wsl script
call wsl -e bash -lic "sed -i 's/\x0D$//' ./wsl.sh; source ./wsl.sh cmd"
:end
pause

View file

@ -625,19 +625,19 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
width: 100%;
overflow-y: visible;
}
.message {
break-inside: avoid;
}
.gradio-container {
overflow: visible;
}
.tab-nav {
display: none !important;
}
#chat-tab > :first-child {
max-width: unset;
}
@ -1291,3 +1291,94 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.dark .footer-button:hover svg {
stroke: rgb(209 213 219);
}
.tgw-accordion {
padding: 10px 12px !important;
}
.dark .tgw-accordion {
border: 1px solid var(--border-color-dark);
}
.welcome-greeting {
text-align: center;
margin-top: 40vh;
font-size: 24px;
opacity: 0.7;
padding-left: 1rem;
padding-right: 1rem;
}
/* Thinking blocks styling */
.thinking-block {
margin-bottom: 12px;
border-radius: 8px;
border: 1px solid rgb(0 0 0 / 10%);
background-color: var(--light-theme-gray);
overflow: hidden;
}
.dark .thinking-block {
background-color: var(--darker-gray);
}
.thinking-header {
display: flex;
align-items: center;
padding: 10px 16px;
cursor: pointer;
user-select: none;
font-size: 14px;
color: rgb(0 0 0 / 70%);
transition: background-color 0.2s;
}
.thinking-header:hover {
background-color: rgb(0 0 0 / 3%);
}
.thinking-header::-webkit-details-marker {
display: none;
}
.thinking-icon {
margin-right: 8px;
color: rgb(0 0 0 / 50%);
}
.thinking-title {
font-weight: 500;
}
.thinking-content {
padding: 12px 16px;
border-top: 1px solid rgb(0 0 0 / 7%);
color: rgb(0 0 0 / 70%);
font-size: 14px;
line-height: 1.5;
overflow-wrap: break-word;
max-height: 250px;
overflow-y: scroll;
contain: layout;
}
/* Animation for opening thinking blocks */
@keyframes fadeIn {
from { opacity: 0; }
to { opacity: 1; }
}
.thinking-block[open] .thinking-content {
animation: fadeIn 0.3s ease-out;
}
/* Additional style for in-progress thinking */
.thinking-block[data-streaming="true"] .thinking-title {
animation: pulse 1.5s infinite;
}
@keyframes pulse {
0% { opacity: 0.6; }
50% { opacity: 1; }
100% { opacity: 0.6; }
}
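
These rules style a collapsible "thinking" section. The `[open]` attribute and `::-webkit-details-marker` selectors suggest a native `<details>` element; below is a hedged Python sketch of that assumed markup (the real generator in the web UI may differ):

```
# Assumed markup only: class names, data attributes, and the use of a native
# <details>/<summary> pair are inferred from the CSS above and the morphdom
# handler later in this diff; this generating function is hypothetical.
def render_thinking_block(thought_html: str, block_id: str, streaming: bool = False) -> str:
    return (
        f'<details class="thinking-block" data-block-id="{block_id}" '
        f'data-streaming="{str(streaming).lower()}" open>'
        '<summary class="thinking-header">'
        '<span class="thinking-icon">&#129504;</span>'  # placeholder icon
        '<span class="thinking-title">Thinking...</span>'
        '</summary>'
        f'<div class="thinking-content">{thought_html}</div>'
        '</details>'
    )
```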

View file

@ -1,146 +0,0 @@
## WSL instructions
If you do not have WSL installed, follow the [instructions below](https://github.com/oobabooga/text-generation-webui/wiki/10-%E2%80%90-WSL#wsl-installation) first.
### Additional WSL setup info
If you want to install Linux to a drive other than C, open powershell and enter these commands:
```
cd D:\Path\To\Linux
$ProgressPreference = 'SilentlyContinue'
Invoke-WebRequest -Uri <LinuxDistroURL> -OutFile Linux.appx -UseBasicParsing
mv Linux.appx Linux.zip
```
Then open Linux.zip and you should see several .appx files inside.
The one with _x64.appx contains the exe installer that you need.
Extract the contents of that _x64.appx file and run <distro>.exe to install.
Linux Distro URLs: https://learn.microsoft.com/en-us/windows/wsl/install-manual#downloading-distributions
**ENSURE THAT THE WSL LINUX DISTRO THAT YOU WISH TO USE IS SET AS THE DEFAULT!**
Do this by using these commands:
```
wsl -l
wsl -s <DistroName>
```
### Web UI Installation
Run the "start" script. By default it will install the web UI in WSL:
/home/{username}/text-gen-install
To launch the web UI in the future after it is already installed, run
the same "start" script. Ensure that one_click.py and wsl.sh are next to it!
### Updating the web UI
As an alternative to running the "update" script, you can also run "wsl.sh update" in WSL.
### Running an interactive shell
As an alternative to running the "cmd" script, you can also run "wsl.sh cmd" in WSL.
### Changing the default install location
To change this, you will need to edit the scripts as follows:
wsl.sh: line ~22 INSTALL_DIR="/path/to/install/dir"
Keep in mind that there is a long-standing bug in WSL that significantly
slows drive read/write speeds when using a physical drive as opposed to
the virtual one that Linux is installed in.
## WSL installation
Guide created by [@jfryton](https://github.com/jfryton). Thank you jfryton.
-----
Here's an easy-to-follow, step-by-step guide for installing Windows Subsystem for Linux (WSL) with Ubuntu on Windows 10/11:
### Step 1: Enable WSL
1. Press the Windows key + X and click on "Windows PowerShell (Admin)" or "Windows Terminal (Admin)" to open PowerShell or Terminal with administrator privileges.
2. In the PowerShell window, type the following command and press Enter:
```
wsl --install
```
If this command doesn't work, you can enable WSL with the following command for Windows 10:
```
wsl --set-default-version 1
```
For Windows 11, you can use:
```
wsl --set-default-version 2
```
You may be prompted to restart your computer. If so, save your work and restart.
### Step 2: Install Ubuntu
1. Open the Microsoft Store.
2. Search for "Ubuntu" in the search bar.
3. Choose the desired Ubuntu version (e.g., Ubuntu 20.04 LTS) and click "Get" or "Install" to download and install the Ubuntu app.
4. Once the installation is complete, click "Launch" or search for "Ubuntu" in the Start menu and open the app.
### Step 3: Set up Ubuntu
1. When you first launch the Ubuntu app, it will take a few minutes to set up. Be patient as it installs the necessary files and sets up your environment.
2. Once the setup is complete, you will be prompted to create a new UNIX username and password. Choose a username and password, and make sure to remember them, as you will need them for future administrative tasks within the Ubuntu environment.
### Step 4: Update and upgrade packages
1. After setting up your username and password, it's a good idea to update and upgrade your Ubuntu system. Run the following commands in the Ubuntu terminal:
```
sudo apt update
sudo apt upgrade
```
2. Enter your password when prompted. This will update the package list and upgrade any outdated packages.
Congratulations! You have now installed WSL with Ubuntu on your Windows 10/11 system. You can use the Ubuntu terminal for various tasks, like running Linux commands, installing packages, or managing files.
You can launch your WSL Ubuntu installation by selecting the Ubuntu app (like any other program installed on your computer) or typing 'ubuntu' into Powershell or Terminal.
### Step 5: Proceed with Linux instructions
1. You can now follow the Linux setup instructions. If you receive any error messages about a missing tool or package, just install them using apt:
```
sudo apt install [missing package]
```
You will probably need to install build-essential
```
sudo apt install build-essential
```
If you face any issues or need to troubleshoot, you can always refer to the official Microsoft documentation for WSL: https://docs.microsoft.com/en-us/windows/wsl/
### WSL2 performance using /mnt:
When you git clone a repository, put it inside WSL and not outside. To understand more, take a look at this [issue](https://github.com/microsoft/WSL/issues/4197#issuecomment-604592340)
### Bonus: Port Forwarding
By default, you won't be able to access the webui from another device on your local network. You will need to setup the appropriate port forwarding using the following steps:
1. First, get the IP address of the WSL by typing `wsl hostname -I`. This will output the IP address, for example `172.20.134.111`.
2. Then, use the following command (using PowerShell or Terminal with administrator privileges) to set up port forwarding, replacing `172.20.134.111` with the IP address you obtained in step 1:
```
netsh interface portproxy add v4tov4 listenaddress=0.0.0.0 listenport=7860 connectaddress=172.20.134.111 connectport=7860
```

View file

@ -1,5 +1,5 @@
'''
Downloads models from Hugging Face to models/username_modelname.
Downloads models from Hugging Face to user_data/models/username_modelname.
Example:
python download-model.py facebook/opt-1.3b
@ -175,7 +175,7 @@ class ModelDownloader:
if model_dir:
base_folder = model_dir
else:
base_folder = 'models' if not is_lora else 'loras'
base_folder = 'user_data/models' if not is_lora else 'user_data/loras'
# If the model is of type GGUF, save directly in the base_folder
if is_llamacpp:
@ -356,7 +356,7 @@ if __name__ == '__main__':
parser.add_argument('--specific-file', type=str, default=None, help='Name of the specific file to download (if not provided, downloads all).')
parser.add_argument('--exclude-pattern', type=str, default=None, help='Regex pattern to exclude files from download.')
parser.add_argument('--output', type=str, default=None, help='Save the model files to this folder.')
parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/models).')
parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/user_data/models).')
parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.')
parser.add_argument('--max-retries', type=int, default=7, help='Max retries count when get error in download time.')
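
A hedged example using the flags defined above (the output folder name is arbitrary):

```
python download-model.py facebook/opt-1.3b --output user_data/models/opt-1.3b
```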

View file

@ -59,4 +59,4 @@ def create_graph(lora_path, lora_name):
print(f"File 'training_graph.json' does not exist in the {lora_path}")
except ImportError:
print("matplotlib is not installed. Please install matplotlib to create PNG graphs")
print("matplotlib is not installed. Please install matplotlib to create PNG graphs")

View file

@ -175,23 +175,23 @@ def ui():
with gr.Row():
with gr.Column():
with gr.Row():
dataset = gr.Dropdown(choices=get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown'])
create_refresh_button(dataset, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'json')}, 'refresh-button')
dataset = gr.Dropdown(choices=get_datasets('user_data/training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown'])
create_refresh_button(dataset, lambda: None, lambda: {'choices': get_datasets('user_data/training/datasets', 'json')}, 'refresh-button')
with gr.Row():
eval_dataset = gr.Dropdown(choices=get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown'])
create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'json')}, 'refresh-button')
eval_dataset = gr.Dropdown(choices=get_datasets('user_data/training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown'])
create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': get_datasets('user_data/training/datasets', 'json')}, 'refresh-button')
with gr.Column():
with gr.Row():
format = gr.Dropdown(choices=get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown'])
create_refresh_button(format, lambda: None, lambda: {'choices': get_datasets('training/formats', 'json')}, 'refresh-button')
format = gr.Dropdown(choices=get_datasets('user_data/training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown'])
create_refresh_button(format, lambda: None, lambda: {'choices': get_datasets('user_data/training/formats', 'json')}, 'refresh-button')
with gr.Row():
eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.')
with gr.Tab(label="Text file"):
with gr.Row():
raw_text_file = gr.Dropdown(choices=get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The text file to use for training.', elem_classes=['slim-dropdown'])
create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': get_datasets('training/datasets', 'txt')}, 'refresh-button')
raw_text_file = gr.Dropdown(choices=get_datasets('user_data/training/datasets', 'txt'), value='None', label='Text file', info='The text file to use for training.', elem_classes=['slim-dropdown'])
create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': get_datasets('user_data/training/datasets', 'txt')}, 'refresh-button')
with gr.Row():
with gr.Column():
@ -208,7 +208,7 @@ def ui():
download_file_url = gr.Textbox(label='Download JSON or txt file to datasets (or formats) folder', value='',info='The URL of a file to download. If on github, make sure you get url of the raw file (https://raw.githubusercontent.com/...). If huggin face, make sure the url has /resolve/ in it not /blob/')
with gr.Row():
download_check_overwrite = gr.Checkbox(label='Overwrite', value=False, info='Overwrite if file exist')
download_folder = gr.Radio(label="Destination", value='training/datasets', choices=['training/datasets', 'training/formats'], interactive=True)
download_folder = gr.Radio(label="Destination", value='user_data/training/datasets', choices=['user_data/training/datasets', 'user_data/training/formats'], interactive=True)
download_button = gr.Button('Download')
download_status = gr.Textbox(label='Download Status', value='', interactive=False)
with gr.Row():
@ -235,7 +235,7 @@ def ui():
with gr.Row():
with gr.Column():
models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True)
evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.')
evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + get_datasets('user_data/training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under user_data/training/datasets.')
with gr.Row():
with gr.Column():
stride_length = gr.Slider(label='Stride', minimum=1, maximum=2048, value=512, step=1, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.')
@ -310,7 +310,7 @@ def ui():
if raw_text_file not in ['None', '']:
logger.info("Loading Text file...")
fullpath = clean_path('training/datasets', f'{raw_text_file}')
fullpath = clean_path('user_data/training/datasets', f'{raw_text_file}')
fullpath = Path(fullpath)
if fullpath.is_dir():
logger.info('Training path directory {}'.format(raw_text_file))
@ -324,10 +324,10 @@ def ui():
logger.info(f"Loaded training file: {file_path.name}")
else:
try:
with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
with open(clean_path('user_data/training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
raw_text = file.read().replace('\r', '')
except:
yield f"{raw_text_file}.txt doesn't seem to exsist anymore... check your training/datasets folder"
yield f"{raw_text_file}.txt doesn't seem to exsist anymore... check your user_data/training/datasets folder"
return
@ -353,7 +353,7 @@ def ui():
yield "Select format choice for dataset."
return
with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile:
with open(clean_path('user_data/training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile:
format_data: dict[str, str] = json.load(formatFile)
def generate_prompt(data_point: dict[str, str]):
@ -381,7 +381,7 @@ def ui():
return tokenize_dummy(prompt)
logger.info("Loading JSON datasets...")
data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json'))
data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json'))
data_keys = []
@ -456,7 +456,7 @@ def ui():
#debug_slicer.change(lambda x: non_serialized_params.update({"debug_slicer": x}), debug_slicer, None)
def update_dataset():
return gr.update(choices=get_datasets('training/datasets', 'json')), gr.update(choices=get_datasets('training/datasets', 'txt'))
return gr.update(choices=get_datasets('user_data/training/datasets', 'json')), gr.update(choices=get_datasets('user_data/training/datasets', 'txt'))
download_button.click(download_file_from_url, [download_file_url,download_check_overwrite,download_folder] , download_status).then(update_dataset,None,[dataset , raw_text_file])
@ -670,7 +670,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
if raw_text_file not in ['None', '']:
train_template["template_type"] = "raw_text"
logger.info("Loading text file...")
fullpath = clean_path('training/datasets', f'{raw_text_file}')
fullpath = clean_path('user_data/training/datasets', f'{raw_text_file}')
fullpath = Path(fullpath)
if fullpath.is_dir():
logger.info('Training path directory {}'.format(raw_text_file))
@ -683,7 +683,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
logger.info(f"Loaded training file: {file_path.name}")
else:
with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
with open(clean_path('user_data/training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
raw_text = file.read().replace('\r', '')
# FPHAM PRECISE SLICING
@ -720,7 +720,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
train_template["template_type"] = "dataset"
with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile:
with open(clean_path('user_data/training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile:
format_data: dict[str, str] = json.load(formatFile)
# == store training prompt ==
@ -742,7 +742,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
return tokenize(prompt, add_eos_token, add_bos_token)
logger.info("Loading JSON datasets...")
data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json'))
data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json'))
train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))
print(f"BOS: {add_bos_token} EOS: {add_eos_token}")
@ -751,7 +751,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
if eval_dataset == 'None':
eval_data = None
else:
eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json'))
eval_data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{eval_dataset}.json'))
eval_data = eval_data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))
# == We MUST reload model if it went through any previous training, even failed one ==
@ -1157,11 +1157,11 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
decoded_entries.append({"value": decoded_text})
# Write the log file
Path('logs').mkdir(exist_ok=True)
with open(Path('logs/train_dataset_sample.json'), 'w') as json_file:
Path('user_data/logs').mkdir(exist_ok=True)
with open(Path('user_data/logs/train_dataset_sample.json'), 'w') as json_file:
json.dump(decoded_entries, json_file, indent=4)
logger.info("Log file 'train_dataset_sample.json' created in the 'logs' directory.")
logger.info("Log file 'train_dataset_sample.json' created in the 'user_data/logs' directory.")
except Exception as e:
logger.error(f"Failed to create log file due to error: {e}")
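
A minimal sketch of the updated dataset location, mirroring the `load_dataset` calls above; the dataset name is a placeholder:

```
from datasets import load_dataset

dataset = "my_dataset"  # a JSON file placed under user_data/training/datasets/
data = load_dataset("json", data_files=f"user_data/training/datasets/{dataset}.json")
print(data["train"][0])
```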

View file

@ -194,13 +194,13 @@ def precise_cut(text: str, overlap: bool, min_chars_cut: int, eos_to_hc: bool, c
if debug_slicer:
# Write the log file
Path('logs').mkdir(exist_ok=True)
Path('user_data/logs').mkdir(exist_ok=True)
sentencelist_dict = {index: sentence for index, sentence in enumerate(sentencelist)}
output_file = "logs/sentencelist.json"
output_file = "user_data/logs/sentencelist.json"
with open(output_file, 'w') as f:
json.dump(sentencelist_dict, f,indent=2)
print("Saved sentencelist.json in logs folder")
print("Saved sentencelist.json in user_data/logs folder")
return sentencelist
@ -281,13 +281,13 @@ def sliding_block_cut(text: str, min_chars_cut: int, eos_to_hc: bool, cutoff_len
if debug_slicer:
# Write the log file
Path('logs').mkdir(exist_ok=True)
Path('user_data/logs').mkdir(exist_ok=True)
sentencelist_dict = {index: sentence for index, sentence in enumerate(sentencelist)}
output_file = "logs/sentencelist.json"
output_file = "user_data/logs/sentencelist.json"
with open(output_file, 'w') as f:
json.dump(sentencelist_dict, f,indent=2)
print("Saved sentencelist.json in logs folder")
print("Saved sentencelist.json in user_data/logs folder")
return sentencelist

View file

@ -72,13 +72,13 @@ def generate_html():
global cards
cards = []
# Iterate through files in image folder
for file in sorted(Path("characters").glob("*")):
for file in sorted(Path("user_data/characters").glob("*")):
if file.suffix in [".json", ".yml", ".yaml"]:
character = file.stem
container_html = '<div class="character-container">'
image_html = "<div class='placeholder'></div>"
for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]:
for path in [Path(f"user_data/characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]:
if path.exists():
image_html = f'<img src="file/{get_image_cache(path)}">'
break
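
A hedged sketch of adding a character that this gallery would pick up; the yaml fields mirror `generate_character_yaml(name, greeting, context)` used elsewhere in this commit, and "Example" is a placeholder name:

```
from pathlib import Path

char_dir = Path("user_data/characters")
char_dir.mkdir(parents=True, exist_ok=True)
(char_dir / "Example.yaml").write_text(
    "name: Example\n"
    "greeting: Hi! How can I help you today?\n"
    "context: Example is a friendly, concise assistant.\n",
    encoding="utf-8",
)
# Optionally place Example.png / Example.jpg alongside it for the card image.
```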

View file

@ -86,6 +86,20 @@ app.add_middleware(
)
@app.middleware("http")
async def validate_host_header(request: Request, call_next):
# Be strict about only approving access to localhost by default
if not (shared.args.listen or shared.args.public_api):
host = request.headers.get("host", "").split(":")[0]
if host not in ["localhost", "127.0.0.1"]:
return JSONResponse(
status_code=400,
content={"detail": "Invalid host header"}
)
return await call_next(request)
@app.options("/", dependencies=check_key)
async def options_route():
return JSONResponse(content="OK")
@ -236,6 +250,11 @@ async def handle_moderations(request: Request):
return JSONResponse(response)
@app.get("/v1/internal/health", dependencies=check_key)
async def handle_health_check():
return JSONResponse(content={"status": "ok"})
@app.post("/v1/internal/encode", response_model=EncodeResponse, dependencies=check_key)
async def handle_token_encode(request_data: EncodeRequest):
response = token_encode(request_data.text)
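
A hedged sketch of calling the new health endpoint from the same machine. The port is the project's usual API default (an assumption), and with the middleware above, requests carrying a non-localhost Host header are rejected with 400 unless `--listen` or `--public-api` is set:

```
import json
import urllib.request

# If --api-key is set, an authorization header would also be needed (not shown).
req = urllib.request.Request("http://127.0.0.1:5000/v1/internal/health")
with urllib.request.urlopen(req) as resp:
    print(json.load(resp))  # expected: {"status": "ok"}
```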

View file

@ -6,7 +6,7 @@ from pydantic import BaseModel, Field
class GenerationOptions(BaseModel):
preset: str | None = Field(default=None, description="The name of a file under text-generation-webui/presets (without the .yaml extension). The sampling parameters that get overwritten by this option are the keys in the default_preset() function in modules/presets.py.")
preset: str | None = Field(default=None, description="The name of a file under text-generation-webui/user_data/presets (without the .yaml extension). The sampling parameters that get overwritten by this option are the keys in the default_preset() function in modules/presets.py.")
dynatemp_low: float = 1
dynatemp_high: float = 1
dynatemp_exponent: float = 1
@ -103,10 +103,10 @@ class ChatCompletionRequestParams(BaseModel):
mode: str = Field(default='instruct', description="Valid options: instruct, chat, chat-instruct.")
instruction_template: str | None = Field(default=None, description="An instruction template defined under text-generation-webui/instruction-templates. If not set, the correct template will be automatically obtained from the model metadata.")
instruction_template: str | None = Field(default=None, description="An instruction template defined under text-generation-webui/user_data/instruction-templates. If not set, the correct template will be automatically obtained from the model metadata.")
instruction_template_str: str | None = Field(default=None, description="A Jinja2 instruction template. If set, will take precedence over everything else.")
character: str | None = Field(default=None, description="A character defined under text-generation-webui/characters. If not set, the default \"Assistant\" character will be used.")
character: str | None = Field(default=None, description="A character defined under text-generation-webui/user_data/characters. If not set, the default \"Assistant\" character will be used.")
bot_name: str | None = Field(default=None, description="Overwrites the value set by character field.", alias="name2")
context: str | None = Field(default=None, description="Overwrites the value set by character field.")
greeting: str | None = Field(default=None, description="Overwrites the value set by character field.")
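
A hedged sketch of how these fields ride along in an OpenAI-style chat completion request. The endpoint and port follow the project's usual defaults (an assumption); "Assistant" and "min_p" are placeholder character and preset names:

```
import json
import urllib.request

payload = {
    "mode": "chat",                 # instruct, chat, or chat-instruct
    "character": "Assistant",       # file under user_data/characters
    "preset": "min_p",              # file under user_data/presets (hypothetical)
    "messages": [{"role": "user", "content": "Hello!"}],
}
req = urllib.request.Request(
    "http://127.0.0.1:5000/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp))
```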

View file

@ -148,7 +148,7 @@ class ChromaCollector():
id_ = new_ids[i]
metadata = metadatas[i] if metadatas is not None else None
embedding = self.embeddings_cache.get(text)
if embedding is not None and embedding.any():
if embedding is not None and any(embedding):
existing_texts.append(text)
existing_embeddings.append(embedding)
existing_ids.append(id_)
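
Why this change matters, illustrated below: the builtin `any()` accepts plain Python lists as well as 1-D NumPy arrays, whereas `.any()` exists only on arrays (whether the cache stores lists or arrays here is an assumption):

```
import numpy as np

as_array = np.array([0.0, 0.12, 0.0])
as_list = [0.0, 0.12, 0.0]

print(as_array.any())  # True  (ndarray method)
print(any(as_array))   # True  (builtin, iterates the elements)
print(any(as_list))    # True  (works; as_list.any() would raise AttributeError)
```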

View file

@ -31,24 +31,94 @@ function removeLastClick() {
}
function handleMorphdomUpdate(text) {
// Track closed blocks
const closedBlocks = new Set();
document.querySelectorAll(".thinking-block").forEach(block => {
const blockId = block.getAttribute("data-block-id");
// If block exists and is not open, add to closed set
if (blockId && !block.hasAttribute("open")) {
closedBlocks.add(blockId);
}
});
// Store scroll positions for any open blocks
const scrollPositions = {};
document.querySelectorAll(".thinking-block[open]").forEach(block => {
const content = block.querySelector(".thinking-content");
const blockId = block.getAttribute("data-block-id");
if (content && blockId) {
const isAtBottom = Math.abs((content.scrollHeight - content.scrollTop) - content.clientHeight) < 5;
scrollPositions[blockId] = {
position: content.scrollTop,
isAtBottom: isAtBottom
};
}
});
morphdom(
document.getElementById("chat").parentNode,
"<div class=\"prose svelte-1ybaih5\">" + text + "</div>",
{
onBeforeElUpdated: function(fromEl, toEl) {
// Preserve code highlighting
if (fromEl.tagName === "PRE" && fromEl.querySelector("code[data-highlighted]")) {
const fromCode = fromEl.querySelector("code");
const toCode = toEl.querySelector("code");
if (fromCode && toCode && fromCode.textContent === toCode.textContent) {
// If the <code> content is the same, preserve the entire <pre> element
toEl.className = fromEl.className;
toEl.innerHTML = fromEl.innerHTML;
return false; // Skip updating the <pre> element
return false;
}
}
// For thinking blocks, respect closed state
if (fromEl.classList && fromEl.classList.contains("thinking-block") &&
toEl.classList && toEl.classList.contains("thinking-block")) {
const blockId = toEl.getAttribute("data-block-id");
// If this block was closed by user, keep it closed
if (blockId && closedBlocks.has(blockId)) {
toEl.removeAttribute("open");
}
}
return !fromEl.isEqualNode(toEl);
},
onElUpdated: function(el) {
// Restore scroll positions for open thinking blocks
if (el.classList && el.classList.contains("thinking-block") && el.hasAttribute("open")) {
const blockId = el.getAttribute("data-block-id");
const content = el.querySelector(".thinking-content");
if (content && blockId && scrollPositions[blockId]) {
setTimeout(() => {
if (scrollPositions[blockId].isAtBottom) {
content.scrollTop = content.scrollHeight;
} else {
content.scrollTop = scrollPositions[blockId].position;
}
}, 0);
}
}
return !fromEl.isEqualNode(toEl); // Update only if nodes differ
}
}
);
// Add toggle listeners for new blocks
document.querySelectorAll(".thinking-block").forEach(block => {
if (!block._hasToggleListener) {
block.addEventListener("toggle", function(e) {
if (this.open) {
const content = this.querySelector(".thinking-content");
if (content) {
setTimeout(() => {
content.scrollTop = content.scrollHeight;
}, 0);
}
}
});
block._hasToggleListener = true;
}
});
}

View file

@ -395,7 +395,7 @@ let bigPictureVisible = false;
function addBigPicture() {
var imgElement = document.createElement("img");
var timestamp = new Date().getTime();
imgElement.src = "/file/cache/pfp_character.png?time=" + timestamp;
imgElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
imgElement.classList.add("bigProfilePicture");
imgElement.addEventListener("load", function () {
this.style.visibility = "visible";

View file

@ -2,6 +2,6 @@ function updateBigPicture() {
var existingElement = document.querySelector(".bigProfilePicture");
if (existingElement) {
var timestamp = new Date().getTime();
existingElement.src = "/file/cache/pfp_character.png?time=" + timestamp;
existingElement.src = "/file/user_data/cache/pfp_character.png?time=" + timestamp;
}
}

View file

@ -417,16 +417,8 @@ def generate_chat_reply(text, state, regenerate=False, _continue=False, loading_
yield history
return
show_after = html.escape(state.get("show_after")) if state.get("show_after") else None
for history in chatbot_wrapper(text, state, regenerate=regenerate, _continue=_continue, loading_message=loading_message, for_ui=for_ui):
if show_after:
after = history["visible"][-1][1].partition(show_after)[2] or "*Is thinking...*"
yield {
'internal': history['internal'],
'visible': history['visible'][:-1] + [[history['visible'][-1][0], after]]
}
else:
yield history
yield history
def character_is_loaded(state, raise_exception=False):
@ -533,9 +525,9 @@ def start_new_chat(state):
def get_history_file_path(unique_id, character, mode):
if mode == 'instruct':
p = Path(f'logs/instruct/{unique_id}.json')
p = Path(f'user_data/logs/instruct/{unique_id}.json')
else:
p = Path(f'logs/chat/{character}/{unique_id}.json')
p = Path(f'user_data/logs/chat/{character}/{unique_id}.json')
return p
@ -571,13 +563,13 @@ def rename_history(old_id, new_id, character, mode):
def get_paths(state):
if state['mode'] == 'instruct':
return Path('logs/instruct').glob('*.json')
return Path('user_data/logs/instruct').glob('*.json')
else:
character = state['character_menu']
# Handle obsolete filenames and paths
old_p = Path(f'logs/{character}_persistent.json')
new_p = Path(f'logs/persistent_{character}.json')
old_p = Path(f'user_data/logs/{character}_persistent.json')
new_p = Path(f'user_data/logs/persistent_{character}.json')
if old_p.exists():
logger.warning(f"Renaming \"{old_p}\" to \"{new_p}\"")
old_p.rename(new_p)
@ -589,7 +581,7 @@ def get_paths(state):
p.parent.mkdir(exist_ok=True)
new_p.rename(p)
return Path(f'logs/chat/{character}').glob('*.json')
return Path(f'user_data/logs/chat/{character}').glob('*.json')
def find_all_histories(state):
@ -740,7 +732,7 @@ def generate_pfp_cache(character):
if not cache_folder.exists():
cache_folder.mkdir()
for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]:
for path in [Path(f"user_data/characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]:
if path.exists():
original_img = Image.open(path)
original_img.save(Path(f'{cache_folder}/pfp_character.png'), format='PNG')
@ -760,12 +752,12 @@ def load_character(character, name1, name2):
filepath = None
for extension in ["yml", "yaml", "json"]:
filepath = Path(f'characters/{character}.{extension}')
filepath = Path(f'user_data/characters/{character}.{extension}')
if filepath.exists():
break
if filepath is None or not filepath.exists():
logger.error(f"Could not find the character \"{character}\" inside characters/. No character has been loaded.")
logger.error(f"Could not find the character \"{character}\" inside user_data/characters. No character has been loaded.")
raise ValueError
file_contents = open(filepath, 'r', encoding='utf-8').read()
@ -804,7 +796,7 @@ def load_instruction_template(template):
if template == 'None':
return ''
for filepath in [Path(f'instruction-templates/{template}.yaml'), Path('instruction-templates/Alpaca.yaml')]:
for filepath in [Path(f'user_data/instruction-templates/{template}.yaml'), Path('user_data/instruction-templates/Alpaca.yaml')]:
if filepath.exists():
break
else:
@ -846,17 +838,17 @@ def upload_character(file, img, tavern=False):
outfile_name = name
i = 1
while Path(f'characters/{outfile_name}.yaml').exists():
while Path(f'user_data/characters/{outfile_name}.yaml').exists():
outfile_name = f'{name}_{i:03d}'
i += 1
with open(Path(f'characters/{outfile_name}.yaml'), 'w', encoding='utf-8') as f:
with open(Path(f'user_data/characters/{outfile_name}.yaml'), 'w', encoding='utf-8') as f:
f.write(yaml_data)
if img is not None:
img.save(Path(f'characters/{outfile_name}.png'))
img.save(Path(f'user_data/characters/{outfile_name}.png'))
logger.info(f'New character saved to "characters/{outfile_name}.yaml".')
logger.info(f'New character saved to "user_data/characters/{outfile_name}.yaml".')
return gr.update(value=outfile_name, choices=get_available_characters())
@ -931,9 +923,9 @@ def save_character(name, greeting, context, picture, filename):
return
data = generate_character_yaml(name, greeting, context)
filepath = Path(f'characters/{filename}.yaml')
filepath = Path(f'user_data/characters/{filename}.yaml')
save_file(filepath, data)
path_to_img = Path(f'characters/{filename}.png')
path_to_img = Path(f'user_data/characters/{filename}.png')
if picture is not None:
picture.save(path_to_img)
logger.info(f'Saved {path_to_img}.')
@ -941,9 +933,9 @@ def save_character(name, greeting, context, picture, filename):
def delete_character(name, instruct=False):
for extension in ["yml", "yaml", "json"]:
delete_file(Path(f'characters/{name}.{extension}'))
delete_file(Path(f'user_data/characters/{name}.{extension}'))
delete_file(Path(f'characters/{name}.png'))
delete_file(Path(f'user_data/characters/{name}.png'))
def jinja_template_from_old_format(params, verbose=False):
@ -1246,7 +1238,7 @@ def handle_save_template_click(instruction_template_str):
contents = generate_instruction_template_yaml(instruction_template_str)
return [
"My Template.yaml",
"instruction-templates/",
"user_data/instruction-templates/",
contents,
gr.update(visible=True)
]
@ -1255,7 +1247,7 @@ def handle_save_template_click(instruction_template_str):
def handle_delete_template_click(template):
return [
f"{template}.yaml",
"instruction-templates/",
"user_data/instruction-templates/",
gr.update(visible=False)
]
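
For orientation, a minimal sketch (not part of this commit) of where chat histories land after the move to user_data/, mirroring get_history_file_path above; the unique_id and character name below are made up:

```python
from pathlib import Path

# Illustrative only: instruct-mode logs and per-character chat logs
# now both live under user_data/logs/.
def history_path(unique_id: str, character: str, mode: str) -> Path:
    if mode == 'instruct':
        return Path(f'user_data/logs/instruct/{unique_id}.json')
    return Path(f'user_data/logs/chat/{character}/{unique_id}.json')

print(history_path('20250427-12-00-00', 'Assistant', 'chat'))
# user_data/logs/chat/Assistant/20250427-12-00-00.json
```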

View file

@ -12,8 +12,8 @@ from modules.text_generation import encode
def load_past_evaluations():
if Path('logs/evaluations.csv').exists():
df = pd.read_csv(Path('logs/evaluations.csv'), dtype=str)
if Path('user_data/logs/evaluations.csv').exists():
df = pd.read_csv(Path('user_data/logs/evaluations.csv'), dtype=str)
df['Perplexity'] = pd.to_numeric(df['Perplexity'])
return df
else:
@ -26,7 +26,7 @@ past_evaluations = load_past_evaluations()
def save_past_evaluations(df):
global past_evaluations
past_evaluations = df
filepath = Path('logs/evaluations.csv')
filepath = Path('user_data/logs/evaluations.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath, index=False)
@ -69,7 +69,7 @@ def calculate_perplexity(models, input_dataset, stride, _max_length):
data = load_dataset('ptb_text_only', 'penn_treebank', split='test')
text = " ".join(data['sentence'])
else:
with open(Path(f'training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f:
with open(Path(f'user_data/training/datasets/{input_dataset}.txt'), 'r', encoding='utf-8') as f:
text = f.read()
for model in models:
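
As a side note on the evaluation code touched above, a small self-contained sketch of how a stride-based sliding window covers a tokenized text for perplexity (illustrative; not taken from this commit):

```python
# Each window spans up to max_length tokens; only the tokens not already
# scored by the previous window contribute to the loss.
def stride_windows(n_tokens: int, max_length: int, stride: int):
    windows, prev_end = [], 0
    for begin in range(0, n_tokens, stride):
        end = min(begin + max_length, n_tokens)
        windows.append((begin, end, end - prev_end))  # (start, end, tokens scored)
        prev_end = end
        if end == n_tokens:
            break
    return windows

print(stride_windows(n_tokens=1100, max_length=512, stride=512))
# [(0, 512, 512), (512, 1024, 512), (1024, 1100, 76)]
```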

View file

@ -40,7 +40,7 @@ class Exllamav2Model:
config.model_dir = str(path_to_model)
config.prepare()
config.max_seq_len = shared.args.max_seq_len
config.max_seq_len = shared.args.ctx_size
config.scale_pos_emb = shared.args.compress_pos_emb
config.scale_alpha_value = shared.args.alpha_value
config.no_flash_attn = shared.args.no_flash_attn
@ -85,7 +85,44 @@ class Exllamav2Model:
model.load_autosplit(cache)
tokenizer = ExLlamaV2Tokenizer(config)
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
# Initialize draft model for speculative decoding
draft_model = None
draft_cache = None
if shared.args.model_draft and shared.args.model_draft.lower() not in ["none", ""]:
logger.info(f"Loading draft model for speculative decoding: {shared.args.model_draft}")
# Find the draft model path
draft_path = Path(shared.args.model_draft)
if not draft_path.exists():
draft_path = Path(f'{shared.args.model_dir}') / Path(shared.args.model_draft)
draft_config = ExLlamaV2Config()
draft_config.model_dir = str(draft_path)
draft_config.prepare()
draft_config.arch_compat_overrides()
# Set context size for draft model
if shared.args.ctx_size_draft > 0:
draft_config.max_seq_len = shared.args.ctx_size_draft
else:
draft_config.max_seq_len = config.max_seq_len
draft_model = ExLlamaV2(draft_config)
draft_cache = cache_type(draft_model, lazy=True)
draft_model.load_autosplit(draft_cache)
logger.info(f"Draft model loaded successfully with max_draft={shared.args.draft_max}")
generator = ExLlamaV2StreamingGenerator(
model,
cache,
tokenizer,
draft_model=draft_model,
draft_cache=draft_cache,
num_speculative_tokens=shared.args.draft_max if draft_model is not None else 0
)
result = self()
result.model = model
@ -93,6 +130,8 @@ class Exllamav2Model:
result.tokenizer = tokenizer
result.generator = generator
result.loras = None
result.draft_model = draft_model
result.draft_cache = draft_cache
return result, result
def encode(self, string, **kwargs):
@ -179,6 +218,10 @@ class Exllamav2Model:
else:
max_new_tokens = state['max_new_tokens']
# Reset speculative decoding stats if using a draft model
if hasattr(self, 'draft_model') and self.draft_model is not None:
self.generator.reset_sd_stats()
self.generator.begin_stream(ids, settings, loras=self.loras)
decoded_text = ''
@ -190,6 +233,11 @@ class Exllamav2Model:
decoded_text += chunk
yield decoded_text
# Log speculative decoding stats if using draft model
if hasattr(self, 'draft_model') and self.draft_model is not None:
efficiency, accuracy, total_tokens, total_draft_tokens, accepted_draft_tokens = self.generator.get_sd_stats()
logger.info(f"Speculative decoding: accepted={accepted_draft_tokens}/{total_draft_tokens} tokens")
def generate(self, prompt, state):
output = ''
for output in self.generate_with_streaming(prompt, state):
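
A small illustrative helper (not in the commit) for reading the statistics logged above, assuming the tuple layout returned by get_sd_stats() in this diff:

```python
def summarize_sd_stats(stats):
    # Assumed layout: (efficiency, accuracy, total_tokens,
    #                  total_draft_tokens, accepted_draft_tokens)
    _, _, total_tokens, total_draft, accepted = stats
    rate = accepted / total_draft if total_draft else 0.0
    return (f"accepted {accepted}/{total_draft} draft tokens "
            f"({rate:.1%}); {total_tokens} tokens generated")

print(summarize_sd_stats((0.7, 0.65, 120, 100, 65)))
# accepted 65/100 draft tokens (65.0%); 120 tokens generated
```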

View file

@ -192,7 +192,7 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin):
config.model_dir = str(pretrained_model_name_or_path)
config.prepare()
config.max_seq_len = shared.args.max_seq_len
config.max_seq_len = shared.args.ctx_size
config.scale_pos_emb = shared.args.compress_pos_emb
config.scale_alpha_value = shared.args.alpha_value
config.no_flash_attn = shared.args.no_flash_attn

View file

@ -5,6 +5,7 @@ from typing import Any, Dict, Optional, Union
import torch
from exllamav3 import Cache, Config, Model
from exllamav3.cache import CacheLayer_fp16, CacheLayer_quant
from torch.nn import CrossEntropyLoss
from transformers import (
GenerationConfig,
@ -33,13 +34,39 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
self.ex_model = Model.from_config(config)
# Calculate the closest multiple of 256 at or above the chosen value
max_tokens = shared.args.max_seq_len
max_tokens = shared.args.ctx_size
if max_tokens % 256 != 0:
adjusted_tokens = ((max_tokens // 256) + 1) * 256
logger.warning(f"max_num_tokens must be a multiple of 256. Adjusting from {max_tokens} to {adjusted_tokens}")
max_tokens = adjusted_tokens
self.ex_cache = Cache(self.ex_model, max_num_tokens=max_tokens)
# Parse cache type
cache_type = shared.args.cache_type.lower()
cache_kwargs = {}
if cache_type == 'fp16':
layer_type = CacheLayer_fp16
elif cache_type.startswith('q'):
layer_type = CacheLayer_quant
if '_' in cache_type:
# Different bits for k and v (e.g., q4_q8)
k_part, v_part = cache_type.split('_')
k_bits = int(k_part[1:])
v_bits = int(v_part[1:])
else:
# Same bits for k and v (e.g., q4)
k_bits = v_bits = int(cache_type[1:])
# Validate bit ranges
if not (2 <= k_bits <= 8 and 2 <= v_bits <= 8):
logger.warning(f"Invalid quantization bits: k_bits={k_bits}, v_bits={v_bits}. Must be between 2 and 8. Falling back to fp16.")
layer_type = CacheLayer_fp16
else:
cache_kwargs = {'k_bits': k_bits, 'v_bits': v_bits}
else:
logger.warning(f"Unrecognized cache type: {cache_type}. Falling back to fp16.")
layer_type = CacheLayer_fp16
self.ex_cache = Cache(self.ex_model, max_num_tokens=max_tokens, layer_type=layer_type, **cache_kwargs)
# Create load parameters dictionary
load_params = {'progressbar': True}
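
A self-contained restatement of the cache-type parsing rule above (a sketch only; the real code maps the result onto ExLlamaV3 cache layer classes):

```python
def parse_cache_type(cache_type: str):
    """Return ('fp16', None, None) or ('quant', k_bits, v_bits)."""
    cache_type = cache_type.lower()
    if cache_type.startswith('q'):
        try:
            if '_' in cache_type:
                # Different bits for k and v, e.g. "q4_q8"
                k_part, v_part = cache_type.split('_')
                k_bits, v_bits = int(k_part[1:]), int(v_part[1:])
            else:
                # Same bits for k and v, e.g. "q4"
                k_bits = v_bits = int(cache_type[1:])
        except ValueError:
            return 'fp16', None, None
        if 2 <= k_bits <= 8 and 2 <= v_bits <= 8:
            return 'quant', k_bits, v_bits
    return 'fp16', None, None  # unrecognized or out of range -> fp16 fallback

assert parse_cache_type('fp16') == ('fp16', None, None)
assert parse_cache_type('q4') == ('quant', 4, 4)
assert parse_cache_type('q4_q8') == ('quant', 4, 8)
assert parse_cache_type('q9') == ('fp16', None, None)
```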

View file

@ -1,5 +1,6 @@
'''
Copied from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14184
Most of the code here was adapted from:
https://github.com/AUTOMATIC1111/stable-diffusion-webui/pull/14184
'''
import inspect
@ -7,6 +8,30 @@ import warnings
from functools import wraps
import gradio as gr
import gradio.routes
import gradio.utils
from starlette.middleware.trustedhost import TrustedHostMiddleware
from modules import shared
orig_create_app = gradio.routes.App.create_app
# Be strict about only approving access to localhost by default
def create_app_with_trustedhost(*args, **kwargs):
app = orig_create_app(*args, **kwargs)
if not (shared.args.listen or shared.args.share):
app.add_middleware(
TrustedHostMiddleware,
allowed_hosts=["localhost", "127.0.0.1"]
)
return app
gradio.routes.App.create_app = create_app_with_trustedhost
gradio.utils.launch_counter = lambda: None
class GradioDeprecationWarning(DeprecationWarning):
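
For reference, the same host check applied to a bare Starlette app (a minimal sketch, not the web UI's actual wiring, which patches gradio's internal app factory as shown above):

```python
from starlette.applications import Starlette
from starlette.middleware.trustedhost import TrustedHostMiddleware
from starlette.responses import PlainTextResponse
from starlette.routing import Route

async def index(request):
    return PlainTextResponse("ok")

app = Starlette(routes=[Route("/", index)])

# Requests whose Host header is not an allowed host are rejected with 400,
# which blunts DNS-rebinding attacks against a server bound to localhost.
app.add_middleware(TrustedHostMiddleware, allowed_hosts=["localhost", "127.0.0.1"])
```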

View file

@ -1,3 +1,4 @@
import datetime
import functools
import html
import os
@ -106,8 +107,87 @@ def replace_blockquote(m):
return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
def extract_thinking_block(string):
"""Extract thinking blocks from the beginning of a string."""
if not string:
return None, string
THINK_START_TAG = "&lt;think&gt;"
THINK_END_TAG = "&lt;/think&gt;"
# Look for opening tag
start_pos = string.lstrip().find(THINK_START_TAG)
if start_pos == -1:
return None, string
# Adjust start position to account for any leading whitespace
start_pos = string.find(THINK_START_TAG)
# Find the content after the opening tag
content_start = start_pos + len(THINK_START_TAG)
# Look for closing tag
end_pos = string.find(THINK_END_TAG, content_start)
if end_pos != -1:
# Both tags found - extract content between them
thinking_content = string[content_start:end_pos]
remaining_content = string[end_pos + len(THINK_END_TAG):]
return thinking_content, remaining_content
else:
# Only opening tag found - everything else is thinking content
thinking_content = string[content_start:]
return thinking_content, ""
@functools.lru_cache(maxsize=None)
def convert_to_markdown(string):
def convert_to_markdown(string, message_id=None):
if not string:
return ""
# Use a default message ID if none provided
if message_id is None:
message_id = "unknown"
# Extract thinking block if present
thinking_content, remaining_content = extract_thinking_block(string)
# Process the main content
html_output = process_markdown_content(remaining_content)
# If thinking content was found, process it using the same function
if thinking_content is not None:
thinking_html = process_markdown_content(thinking_content)
# Generate unique ID for the thinking block
block_id = f"thinking-{message_id}-0"
# Check if thinking is complete or still in progress
is_streaming = not remaining_content
title_text = "Thinking..." if is_streaming else "Thought"
thinking_block = f'''
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}" open>
<summary class="thinking-header">
<svg class="thinking-icon" width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M8 1.33334C4.31868 1.33334 1.33334 4.31868 1.33334 8.00001C1.33334 11.6813 4.31868 14.6667 8 14.6667C11.6813 14.6667 14.6667 11.6813 14.6667 8.00001C14.6667 4.31868 11.6813 1.33334 8 1.33334Z" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M8 10.6667V8.00001" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M8 5.33334H8.00667" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>
</svg>
<span class="thinking-title">{title_text}</span>
</summary>
<div class="thinking-content pretty_scrollbar">{thinking_html}</div>
</details>
'''
# Prepend the thinking block to the message HTML
html_output = thinking_block + html_output
return html_output
def process_markdown_content(string):
"""Process a string through the markdown conversion pipeline."""
if not string:
return ""
@ -208,15 +288,15 @@ def convert_to_markdown(string):
return html_output
def convert_to_markdown_wrapped(string, use_cache=True):
def convert_to_markdown_wrapped(string, message_id=None, use_cache=True):
'''
Used to avoid caching convert_to_markdown calls during streaming.
'''
if use_cache:
return convert_to_markdown(string)
return convert_to_markdown(string, message_id=message_id)
return convert_to_markdown.__wrapped__(string)
return convert_to_markdown.__wrapped__(string, message_id=message_id)
def generate_basic_html(string):
@ -272,7 +352,7 @@ def generate_instruct_html(history):
for i in range(len(history['visible'])):
row_visible = history['visible'][i]
row_internal = history['internal'][i]
converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
if converted_visible[0]: # Don't display empty user messages
output += (
@ -307,19 +387,19 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache=
# We use ?character and ?time.time() to force the browser to reset caches
img_bot = (
f'<img src="file/cache/pfp_character_thumb.png?{character}" class="pfp_character">'
if Path("cache/pfp_character_thumb.png").exists() else ''
f'<img src="file/user_data/cache/pfp_character_thumb.png?{character}" class="pfp_character">'
if Path("user_data/cache/pfp_character_thumb.png").exists() else ''
)
img_me = (
f'<img src="file/cache/pfp_me.png?{time.time() if reset_cache else ""}">'
if Path("cache/pfp_me.png").exists() else ''
f'<img src="file/user_data/cache/pfp_me.png?{time.time() if reset_cache else ""}">'
if Path("user_data/cache/pfp_me.png").exists() else ''
)
for i in range(len(history['visible'])):
row_visible = history['visible'][i]
row_internal = history['internal'][i]
converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
if converted_visible[0]: # Don't display empty user messages
output += (
@ -359,7 +439,7 @@ def generate_chat_html(history, name1, name2, reset_cache=False):
for i in range(len(history['visible'])):
row_visible = history['visible'][i]
row_internal = history['internal'][i]
converted_visible = [convert_to_markdown_wrapped(entry, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
if converted_visible[0]: # Don't display empty user messages
output += (
@ -389,8 +469,21 @@ def generate_chat_html(history, name1, name2, reset_cache=False):
return output
def time_greeting():
current_hour = datetime.datetime.now().hour
if 5 <= current_hour < 12:
return "Good morning!"
elif 12 <= current_hour < 18:
return "Good afternoon!"
else:
return "Good evening!"
def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False):
if mode == 'instruct':
if len(history['visible']) == 0:
greeting = f"<div class=\"welcome-greeting\">{time_greeting()} How can I help you today?</div>"
result = f'<div class="chat" id="chat">{greeting}</div>'
elif mode == 'instruct':
result = generate_instruct_html(history)
elif style == 'wpp':
result = generate_chat_html(history, name1, name2)
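
A simplified, self-contained illustration of the splitting rule above. The real extract_thinking_block matches the tags in their HTML-escaped form, apparently because the visible history is escaped before it reaches the converter; the input string below is made up:

```python
import html

START, END = "&lt;think&gt;", "&lt;/think&gt;"

def split_thinking(escaped: str):
    _, tag, rest = escaped.partition(START)
    if not tag:
        return None, escaped            # no thinking block at all
    thinking, tag, remainder = rest.partition(END)
    if not tag:
        return thinking, ""             # still streaming: everything so far is thinking
    return thinking, remainder

raw = "<think>check the units first</think>The answer is 42."
print(split_thinking(html.escape(raw)))
# ('check the units first', 'The answer is 42.')
```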

View file

@ -1,11 +1,13 @@
import json
import os
import pprint
import re
import socket
import subprocess
import sys
import threading
import time
from pathlib import Path
import llama_cpp_binaries
import requests
@ -251,7 +253,7 @@ class LlamaServer:
cmd = [
self.server_path,
"--model", self.model_path,
"--ctx-size", str(shared.args.n_ctx),
"--ctx-size", str(shared.args.ctx_size),
"--n-gpu-layers", str(shared.args.n_gpu_layers),
"--batch-size", str(shared.args.batch_size),
"--port", str(self.port),
@ -281,6 +283,41 @@ class LlamaServer:
cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
if shared.args.rope_freq_base > 0:
cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)]
if shared.args.model_draft not in [None, 'None']:
path = Path(shared.args.model_draft)
if not path.exists():
path = Path(f'{shared.args.model_dir}/{shared.args.model_draft}')
if path.is_file():
model_file = path
else:
model_file = sorted(Path(f'{shared.args.model_dir}/{shared.args.model_draft}').glob('*.gguf'))[0]
cmd += ["--model-draft", model_file]
if shared.args.draft_max > 0:
cmd += ["--draft-max", str(shared.args.draft_max)]
if shared.args.gpu_layers_draft > 0:
cmd += ["--gpu-layers-draft", str(shared.args.gpu_layers_draft)]
if shared.args.device_draft:
cmd += ["--device-draft", shared.args.device_draft]
if shared.args.ctx_size_draft > 0:
cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)]
if shared.args.streaming_llm:
cmd += ["--cache-reuse", "1"]
if shared.args.extra_flags:
# Clean up the input
extra_flags = shared.args.extra_flags.strip()
if extra_flags.startswith('"') and extra_flags.endswith('"'):
extra_flags = extra_flags[1:-1].strip()
elif extra_flags.startswith("'") and extra_flags.endswith("'"):
extra_flags = extra_flags[1:-1].strip()
for flag_item in extra_flags.split(','):
if '=' in flag_item:
flag, value = flag_item.split('=', 1)
cmd += [f"--{flag}", value]
else:
cmd.append(f"--{flag_item}")
env = os.environ.copy()
if os.name == 'posix':
@ -299,17 +336,7 @@ class LlamaServer:
env=env
)
def filter_stderr(process_stderr):
try:
for line in iter(process_stderr.readline, ''):
if not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line:
sys.stderr.write(line)
sys.stderr.flush()
except (ValueError, IOError):
# Handle pipe closed exceptions
pass
threading.Thread(target=filter_stderr, args=(self.process.stderr,), daemon=True).start()
threading.Thread(target=filter_stderr_with_progress, args=(self.process.stderr,), daemon=True).start()
# Wait for server to be healthy
health_url = f"http://127.0.0.1:{self.port}/health"
@ -360,3 +387,18 @@ class LlamaServer:
self.process.kill()
self.process = None
def filter_stderr_with_progress(process_stderr):
progress_pattern = re.compile(r'slot update_slots: id.*progress = (\d+\.\d+)')
try:
for line in iter(process_stderr.readline, ''):
progress_match = progress_pattern.search(line)
if progress_match:
sys.stderr.write(line)
sys.stderr.flush()
elif not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line:
sys.stderr.write(line)
sys.stderr.flush()
except (ValueError, IOError):
pass
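
A standalone sketch of how the new --extra-flags string expands into llama-server arguments, mirroring the parsing above (the flag names in the example are only illustrative):

```python
def expand_extra_flags(extra_flags: str):
    extra_flags = extra_flags.strip()
    # Strip one pair of surrounding quotes, if present
    if extra_flags[:1] == extra_flags[-1:] and extra_flags[:1] in ('"', "'"):
        extra_flags = extra_flags[1:-1].strip()
    cmd = []
    for item in extra_flags.split(','):
        if '=' in item:
            flag, value = item.split('=', 1)
            cmd += [f"--{flag}", value]   # "key=value" -> "--key value"
        else:
            cmd.append(f"--{item}")       # bare "key" -> "--key"
    return cmd

print(expand_extra_flags('"override-tensor=exps=CPU,no-warmup"'))
# ['--override-tensor', 'exps=CPU', '--no-warmup']
```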

View file

@ -9,9 +9,11 @@ loaders_and_params = OrderedDict({
'threads',
'threads_batch',
'batch_size',
'n_ctx',
'ctx_size',
'cache_type',
'tensor_split',
'extra_flags',
'streaming_llm',
'rope_freq_base',
'compress_pos_emb',
'flash_attn',
@ -20,6 +22,12 @@ loaders_and_params = OrderedDict({
'no_mmap',
'mlock',
'numa',
'model_draft',
'draft_max',
'gpu_layers_draft',
'device_draft',
'ctx_size_draft',
'speculative_decoding_accordion',
],
'Transformers': [
'gpu_split',
@ -41,14 +49,15 @@ loaders_and_params = OrderedDict({
'no_use_fast',
],
'ExLlamav3_HF': [
'max_seq_len',
'ctx_size',
'cache_type',
'gpu_split',
'cfg_cache',
'trust_remote_code',
'no_use_fast',
],
'ExLlamav2_HF': [
'max_seq_len',
'ctx_size',
'cache_type',
'gpu_split',
'alpha_value',
@ -64,7 +73,7 @@ loaders_and_params = OrderedDict({
'no_use_fast',
],
'ExLlamav2': [
'max_seq_len',
'ctx_size',
'cache_type',
'gpu_split',
'alpha_value',
@ -76,6 +85,10 @@ loaders_and_params = OrderedDict({
'no_xformers',
'no_sdpa',
'exllamav2_info',
'model_draft',
'draft_max',
'ctx_size_draft',
'speculative_decoding_accordion',
],
'HQQ': [
'hqq_backend',
@ -83,7 +96,7 @@ loaders_and_params = OrderedDict({
'no_use_fast',
],
'TensorRT-LLM': [
'max_seq_len',
'ctx_size',
'cpp_runner',
'tensorrt_llm_info',
]

View file

@ -52,10 +52,8 @@ def load_model(model_name, loader=None):
tokenizer = load_tokenizer(model_name)
shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt'):
shared.settings['truncation_length'] = shared.args.max_seq_len
elif loader == 'llama.cpp':
shared.settings['truncation_length'] = shared.args.n_ctx
if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt') or loader == 'llama.cpp':
shared.settings['truncation_length'] = shared.args.ctx_size
logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
logger.info(f"LOADER: \"{loader}\"")

View file

@ -11,8 +11,7 @@ def get_fallback_settings():
return {
'bf16': False,
'use_eager_attention': False,
'max_seq_len': 2048,
'n_ctx': 2048,
'ctx_size': 2048,
'rope_freq_base': 0,
'compress_pos_emb': 1,
'alpha_value': 1,
@ -26,7 +25,7 @@ def get_fallback_settings():
def get_model_metadata(model):
model_settings = {}
# Get settings from models/config.yaml and models/config-user.yaml
# Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
settings = shared.model_config
for pat in settings:
if re.match(pat.lower(), Path(model).name.lower()):
@ -59,7 +58,7 @@ def get_model_metadata(model):
for k in metadata:
if k.endswith('context_length'):
model_settings['n_ctx'] = min(metadata[k], 8192)
model_settings['ctx_size'] = min(metadata[k], 8192)
model_settings['truncation_length_info'] = metadata[k]
elif k.endswith('rope.freq_base'):
model_settings['rope_freq_base'] = metadata[k]
@ -97,7 +96,7 @@ def get_model_metadata(model):
if k in metadata:
model_settings['truncation_length'] = metadata[k]
model_settings['truncation_length_info'] = metadata[k]
model_settings['max_seq_len'] = min(metadata[k], 8192)
model_settings['ctx_size'] = min(metadata[k], 8192)
if 'rope_theta' in metadata:
model_settings['rope_freq_base'] = metadata['rope_theta']
@ -145,7 +144,7 @@ def get_model_metadata(model):
if 'rope_freq_base' in model_settings and model_settings['rope_freq_base'] == 10000:
model_settings.pop('rope_freq_base')
# Apply user settings from models/config-user.yaml
# Apply user settings from user_data/models/config-user.yaml
settings = shared.user_config
for pat in settings:
if re.match(pat.lower(), Path(model).name.lower()):
@ -224,7 +223,7 @@ def apply_model_settings_to_state(model, state):
def save_model_settings(model, state):
'''
Save the settings for this model to models/config-user.yaml
Save the settings for this model to user_data/models/config-user.yaml
'''
if model == 'None':
yield ("Not saving the settings because no model is selected in the menu.")

View file

@ -1,9 +0,0 @@
from pathlib import Path
from modules.logging_colors import logger
if Path('../webui.py').exists():
logger.warning('\nIt looks like you are running an outdated version of '
'the one-click-installers.\n'
'Please migrate your installation following the instructions here:\n'
'https://github.com/oobabooga/text-generation-webui/wiki/Migrating-an-old-one%E2%80%90click-install')

View file

@ -58,7 +58,7 @@ def presets_params():
def load_preset(name, verbose=False):
generate_params = default_preset()
if name not in ['None', None, '']:
path = Path(f'presets/{name}.yaml')
path = Path(f'user_data/presets/{name}.yaml')
if path.exists():
with open(path, 'r') as infile:
preset = yaml.safe_load(infile)

View file

@ -7,7 +7,7 @@ def load_prompt(fname):
if fname in ['None', '']:
return ''
else:
file_path = Path(f'prompts/{fname}.txt')
file_path = Path(f'user_data/prompts/{fname}.txt')
if not file_path.exists():
return ''

View file

@ -1,6 +1,7 @@
import argparse
import copy
import os
import shlex
import sys
from collections import OrderedDict
from pathlib import Path
@ -31,7 +32,7 @@ need_restart = False
settings = {
'show_controls': True,
'start_with': '',
'mode': 'chat-instruct',
'mode': 'instruct',
'chat_style': 'cai-chat',
'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
'prompt-default': 'QA',
@ -57,7 +58,6 @@ settings = {
'seed': -1,
'custom_stopping_strings': '',
'custom_token_bans': '',
'show_after': '',
'negative_prompt': '',
'autoload_model': False,
'dark_theme': True,
@ -77,10 +77,10 @@ group.add_argument('--multi-user', action='store_true', help='Multi-user mode. C
group.add_argument('--character', type=str, help='The name of the character to load in chat mode by default.')
group.add_argument('--model', type=str, help='Name of the model to load by default.')
group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.')
group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.')
group.add_argument('--model-dir', type=str, default='user_data/models', help='Path to directory with all the models.')
group.add_argument('--lora-dir', type=str, default='user_data/loras', help='Path to directory with all the loras.')
group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See user_data/settings-template.yaml for an example. If you create a file called user_data/settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
group.add_argument('--idle-timeout', type=int, default=0, help='Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.')
@ -94,7 +94,7 @@ group = parser.add_argument_group('Transformers/Accelerate')
group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
group.add_argument('--cpu-memory', type=float, default=0, help='Maximum CPU memory in GiB. Use this for CPU offloading.')
group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".')
group.add_argument('--disk-cache-dir', type=str, default='user_data/cache', help='Directory to save the disk cache to. Defaults to "user_data/cache".')
group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
group.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.')
group.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces VRAM usage slightly, but it comes at a performance cost.')
@ -115,10 +115,9 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for
# llama.cpp
group = parser.add_argument_group('llama.cpp')
group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.')
group.add_argument('--n_ctx', type=int, default=8192, help='Size of the prompt context.')
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
group.add_argument('--batch-size', type=int, default=2048, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
@ -126,17 +125,31 @@ group.add_argument('--tensor-split', type=str, default=None, help='Split the mod
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"')
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
# Cache
group = parser.add_argument_group('Context and cache management')
group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.')
group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
# Speculative decoding
group = parser.add_argument_group('Speculative decoding')
group.add_argument('--model-draft', type=str, default=None, help='Path to the draft model for speculative decoding.')
group.add_argument('--draft-max', type=int, default=4, help='Number of tokens to draft for speculative decoding.')
group.add_argument('--gpu-layers-draft', type=int, default=256, help='Number of layers to offload to the GPU for the draft model.')
group.add_argument('--device-draft', type=str, default=None, help='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
group.add_argument('--ctx-size-draft', type=int, default=0, help='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
# ExLlamaV2
group = parser.add_argument_group('ExLlamaV2')
group.add_argument('--gpu-split', type=str, help='Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.')
group.add_argument('--autosplit', action='store_true', help='Autosplit the model tensors across the available GPUs. This causes --gpu-split to be ignored.')
group.add_argument('--max_seq_len', type=int, default=8192, help='Maximum sequence length.')
group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.')
group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.')
group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.')
group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.')
group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
# HQQ
@ -192,12 +205,36 @@ group.add_argument('--nowebui', action='store_true', help='Do not launch the Gra
# Deprecated parameters
group = parser.add_argument_group('Deprecated')
# Handle CMD_FLAGS.txt
cmd_flags_path = Path(__file__).parent.parent / "user_data" / "CMD_FLAGS.txt"
if cmd_flags_path.exists():
with cmd_flags_path.open('r', encoding='utf-8') as f:
cmd_flags = ' '.join(
line.strip().rstrip('\\').strip()
for line in f
if line.strip().rstrip('\\').strip() and not line.strip().startswith('#')
)
if cmd_flags:
# Command-line takes precedence over CMD_FLAGS.txt
sys.argv = [sys.argv[0]] + shlex.split(cmd_flags) + sys.argv[1:]
args = parser.parse_args()
args_defaults = parser.parse_args([])
# Create a mapping of all argument aliases to their canonical names
alias_to_dest = {}
for action in parser._actions:
for opt in action.option_strings:
alias_to_dest[opt.lstrip('-').replace('-', '_')] = action.dest
provided_arguments = []
for arg in sys.argv[1:]:
arg = arg.lstrip('-').replace('-', '_')
if hasattr(args, arg):
if arg in alias_to_dest:
provided_arguments.append(alias_to_dest[arg])
elif hasattr(args, arg):
provided_arguments.append(arg)
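
A self-contained sketch of the user_data/CMD_FLAGS.txt handling above: comments and blank lines are dropped, trailing backslashes stripped, and the file's flags are spliced in before the real command line so that explicit arguments, parsed later, take precedence (the file contents in the example are made up):

```python
import shlex

def merge_cmd_flags(file_text: str, argv: list) -> list:
    parts = []
    for line in file_text.splitlines():
        line = line.strip().rstrip('\\').strip()
        if line and not line.startswith('#'):
            parts.append(line)
    flags = ' '.join(parts)
    if not flags:
        return argv
    return [argv[0]] + shlex.split(flags) + argv[1:]

cmd_flags_txt = "# Only used by the one-click installer.\n--listen --api\n"
print(merge_cmd_flags(cmd_flags_txt, ["server.py", "--model", "some-model"]))
# ['server.py', '--listen', '--api', '--model', 'some-model']
```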

View file

@ -1,15 +1,15 @@
from pathlib import Path
import tensorrt_llm
import torch
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
import tensorrt_llm
from modules import shared
from modules.logging_colors import logger
from modules.text_generation import (
get_max_prompt_length,
get_reply_from_output_ids
)
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
class TensorRTLLMModel:
@ -35,7 +35,7 @@ class TensorRTLLMModel:
logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"")
runner_kwargs.update(
max_batch_size=1,
max_input_len=shared.args.max_seq_len - 512,
max_input_len=shared.args.ctx_size - 512,
max_output_len=512,
max_beam_width=1,
max_attention_window_size=None,

View file

@ -264,6 +264,11 @@ def apply_stopping_strings(reply, all_stop_strings):
def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
import torch
if torch.cuda.is_available():
torch.cuda.synchronize()
reply = decode(output_ids[starting_from:], state['skip_special_tokens'] if state else True)
# Handle tokenizers that do not add the leading space for the first token

View file

@ -52,7 +52,7 @@ def create_ui():
with gr.Column():
always_override = gr.Checkbox(label='Override Existing Files', value=False, info='If the name is the same, checking will replace the existing file, and unchecking will load and continue from it (the rank must be the same).', elem_classes=['no-background'])
with gr.Accordion(label='Target Modules', open=False):
with gr.Accordion(label='Target Modules', open=False, elem_classes='tgw-accordion'):
gr.Markdown("Selects which modules to target in training. Targeting more modules is closer to a full fine-tune at the cost of increased VRAM requirements and adapter size.\nNOTE: Only works for model_id='llama', other types will retain default training behavior and not use these settings.")
with gr.Row():
with gr.Column():
@ -86,7 +86,7 @@ def create_ui():
with gr.Row():
lr_scheduler_type = gr.Dropdown(label='LR Scheduler', value='linear', choices=['linear', 'constant', 'constant_with_warmup', 'cosine', 'cosine_with_restarts', 'polynomial', 'inverse_sqrt'], info='Learning rate scheduler - defines how the learning rate changes over time. "Constant" means never change, "linear" means to go in a straight line from the learning rate down to 0, cosine follows a curve, etc.', elem_classes=['slim-dropdown'])
with gr.Accordion(label='Advanced Options', open=False):
with gr.Accordion(label='Advanced Options', open=False, elem_classes='tgw-accordion'):
with gr.Row():
with gr.Column():
lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers. This can help reduce overfitting. Most users should leave at default.')
@ -106,23 +106,23 @@ def create_ui():
with gr.Column():
with gr.Tab(label='Formatted Dataset'):
with gr.Row():
format = gr.Dropdown(choices=utils.get_datasets('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown'], interactive=not mu)
ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('training/formats', 'json')}, 'refresh-button', interactive=not mu)
format = gr.Dropdown(choices=utils.get_datasets('user_data/training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.', elem_classes=['slim-dropdown'], interactive=not mu)
ui.create_refresh_button(format, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/formats', 'json')}, 'refresh-button', interactive=not mu)
with gr.Row():
dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu)
ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button', interactive=not mu)
dataset = gr.Dropdown(choices=utils.get_datasets('user_data/training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu)
ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/datasets', 'json')}, 'refresh-button', interactive=not mu)
with gr.Row():
eval_dataset = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown'], interactive=not mu)
ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'json')}, 'refresh-button', interactive=not mu)
eval_dataset = gr.Dropdown(choices=utils.get_datasets('user_data/training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.', elem_classes=['slim-dropdown'], interactive=not mu)
ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/datasets', 'json')}, 'refresh-button', interactive=not mu)
eval_steps = gr.Number(label='Evaluate every n steps', value=100, info='If an evaluation dataset is given, test it every time this many steps pass.')
with gr.Tab(label="Raw text file"):
with gr.Row():
raw_text_file = gr.Dropdown(choices=utils.get_datasets('training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu)
ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('training/datasets', 'txt')}, 'refresh-button', interactive=not mu)
raw_text_file = gr.Dropdown(choices=utils.get_datasets('user_data/training/datasets', 'txt'), value='None', label='Text file', info='The raw text file to use for training.', elem_classes=['slim-dropdown'], interactive=not mu)
ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': utils.get_datasets('user_data/training/datasets', 'txt')}, 'refresh-button', interactive=not mu)
with gr.Row():
with gr.Column():
@ -143,7 +143,7 @@ def create_ui():
with gr.Row():
with gr.Column():
models = gr.Dropdown(utils.get_available_models(), label='Models', multiselect=True, interactive=not mu)
evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under training/datasets.', interactive=not mu)
evaluate_text_file = gr.Dropdown(choices=['wikitext', 'ptb', 'ptb_new'] + utils.get_datasets('user_data/training/datasets', 'txt')[1:], value='wikitext', label='Input dataset', info='The raw text file on which the model will be evaluated. The first options are automatically downloaded: wikitext, ptb, and ptb_new. The next options are your local text files under user_data/training/datasets.', interactive=not mu)
with gr.Row():
with gr.Column():
stride_length = gr.Slider(label='Stride', minimum=0, maximum=32768, value=512, step=256, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.')
@ -402,7 +402,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
if raw_text_file not in ['None', '']:
train_template["template_type"] = "raw_text"
logger.info("Loading raw text file dataset")
fullpath = clean_path('training/datasets', f'{raw_text_file}')
fullpath = clean_path('user_data/training/datasets', f'{raw_text_file}')
fullpath = Path(fullpath)
if fullpath.is_dir():
logger.info('Training path directory {}'.format(raw_text_file))
@ -415,7 +415,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
logger.info(f"Loaded training file: {file_path.name}")
else:
with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
with open(clean_path('user_data/training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
raw_text = file.read().replace('\r', '')
cut_string = hard_cut_string.replace('\\n', '\n')
@ -460,7 +460,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
train_template["template_type"] = "dataset"
with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile:
with open(clean_path('user_data/training/formats', f'{format}.json'), 'r', encoding='utf-8-sig') as formatFile:
format_data: dict[str, str] = json.load(formatFile)
# == store training prompt ==
@ -482,13 +482,13 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
return tokenize(prompt, add_eos_token)
logger.info("Loading JSON datasets")
data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json'))
data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{dataset}.json'))
train_data = data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))
if eval_dataset == 'None':
eval_data = None
else:
eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json'))
eval_data = load_dataset("json", data_files=clean_path('user_data/training/datasets', f'{eval_dataset}.json'))
eval_data = eval_data['train'].map(generate_and_tokenize_prompt, new_fingerprint='%030x' % random.randrange(16**30))
# == We MUST reload model if it went through any previous training, even failed one ==
@ -676,11 +676,11 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
decoded_entries.append({"value": decoded_text})
# Write the log file
Path('logs').mkdir(exist_ok=True)
with open(Path('logs/train_dataset_sample.json'), 'w') as json_file:
Path('user_data/logs').mkdir(exist_ok=True)
with open(Path('user_data/logs/train_dataset_sample.json'), 'w') as json_file:
json.dump(decoded_entries, json_file, indent=4)
logger.info("Log file 'train_dataset_sample.json' created in the 'logs' directory.")
logger.info("Log file 'train_dataset_sample.json' created in the 'user_data/logs' directory.")
except Exception as e:
logger.error(f"Failed to create log file due to error: {e}")

View file

@ -249,7 +249,7 @@ def load_model_HF(model_name):
)
if shared.args.disk:
params['offload_folder'] = shared.args.disk_cache_dir
params['offload_folder'] = str(Path(shared.args.disk_cache_dir))
if shared.args.compress_pos_emb > 1:
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}

View file

@ -94,7 +94,7 @@ if not shared.args.old_colors:
input_radius='0.375rem',
)
if Path("notification.mp3").exists():
if Path("user_data/notification.mp3").exists():
audio_notification_js = "document.querySelector('#audio_notification audio')?.play();"
else:
audio_notification_js = ""
@ -110,10 +110,10 @@ def list_model_elements():
'threads_batch',
'batch_size',
'hqq_backend',
'n_ctx',
'max_seq_len',
'ctx_size',
'cache_type',
'tensor_split',
'extra_flags',
'gpu_split',
'alpha_value',
'rope_freq_base',
@ -145,6 +145,11 @@ def list_model_elements():
'cpp_runner',
'trust_remote_code',
'no_use_fast',
'model_draft',
'draft_max',
'gpu_layers_draft',
'device_draft',
'ctx_size_draft',
]
return elements
@ -201,7 +206,6 @@ def list_interface_input_elements():
'sampler_priority',
'custom_stopping_strings',
'custom_token_bans',
'show_after',
'negative_prompt',
'dry_sequence_breakers',
'grammar_string',
@ -262,7 +266,7 @@ def apply_interface_values(state, use_persistent=False):
if 'textbox-default' in state and 'prompt_menu-default' in state:
state.pop('prompt_menu-default')
if 'textbox-notebook' and 'prompt_menu-notebook' in state:
if 'textbox-notebook' in state and 'prompt_menu-notebook' in state:
state.pop('prompt_menu-notebook')
elements = list_interface_input_elements()
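
The condition change above fixes a truthiness bug: a bare non-empty string is always truthy, so the old form only ever evaluated the second membership test. A quick illustration with a made-up dictionary:

```python
state = {'prompt_menu-notebook': 1}   # 'textbox-notebook' is missing

# Old form: parsed as 'textbox-notebook' and ('prompt_menu-notebook' in state)
print('textbox-notebook' and 'prompt_menu-notebook' in state)            # True
# Fixed form: both keys must actually be present
print('textbox-notebook' in state and 'prompt_menu-notebook' in state)   # False
```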

View file

@ -88,7 +88,7 @@ def create_ui():
shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
with gr.Row():
shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
with gr.Row():
shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')
@ -146,7 +146,7 @@ def create_chat_settings_ui():
with gr.Column(scale=1):
shared.gradio['character_picture'] = gr.Image(label='Character picture', type='pil', interactive=not mu)
shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('cache/pfp_me.png')) if Path('cache/pfp_me.png').exists() else None, interactive=not mu)
shared.gradio['your_picture'] = gr.Image(label='Your picture', type='pil', value=Image.open(Path('user_data/cache/pfp_me.png')) if Path('user_data/cache/pfp_me.png').exists() else None, interactive=not mu)
with gr.Tab('Instruction template'):
with gr.Row():

View file

@ -102,7 +102,7 @@ def handle_save_prompt(text):
return [
text,
utils.current_time() + ".txt",
"prompts/",
"user_data/prompts/",
gr.update(visible=True)
]
@ -110,6 +110,6 @@ def handle_save_prompt(text):
def handle_delete_prompt(prompt):
return [
prompt + ".txt",
"prompts/",
"user_data/prompts/",
gr.update(visible=True)
]

View file

@ -28,7 +28,7 @@ def create_ui():
# Character saver/deleter
with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['character_saver']:
shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your characters/ folder with this base filename.')
shared.gradio['save_character_filename'] = gr.Textbox(lines=1, label='File name', info='The character will be saved to your user_data/characters folder with this base filename.')
with gr.Row():
shared.gradio['save_character_cancel'] = gr.Button('Cancel', elem_classes="small-button")
shared.gradio['save_character_confirm'] = gr.Button('Save', elem_classes="small-button", variant='primary', interactive=not mu)
@ -41,7 +41,7 @@ def create_ui():
# Preset saver
with gr.Group(visible=False, elem_classes='file-saver') as shared.gradio['preset_saver']:
shared.gradio['save_preset_filename'] = gr.Textbox(lines=1, label='File name', info='The preset will be saved to your presets/ folder with this base filename.')
shared.gradio['save_preset_filename'] = gr.Textbox(lines=1, label='File name', info='The preset will be saved to your user_data/presets folder with this base filename.')
shared.gradio['save_preset_contents'] = gr.Textbox(lines=10, label='File contents')
with gr.Row():
shared.gradio['save_preset_cancel'] = gr.Button('Cancel', elem_classes="small-button")
@ -72,7 +72,7 @@ def create_event_handlers():
def handle_save_preset_confirm_click(filename, contents):
try:
utils.save_file(f"presets/{filename}.yaml", contents)
utils.save_file(f"user_data/presets/{filename}.yaml", contents)
available_presets = utils.get_available_presets()
output = gr.update(choices=available_presets, value=filename)
except Exception:
@ -145,7 +145,7 @@ def handle_save_preset_click(state):
def handle_delete_preset_click(preset):
return [
f"{preset}.yaml",
"presets/",
"user_data/presets/",
gr.update(visible=True)
]
@ -154,7 +154,7 @@ def handle_save_grammar_click(grammar_string):
return [
grammar_string,
"My Fancy Grammar.gbnf",
"grammars/",
"user_data/grammars/",
gr.update(visible=True)
]
@ -162,6 +162,6 @@ def handle_save_grammar_click(grammar_string):
def handle_delete_grammar_click(grammar_file):
return [
grammar_file,
"grammars/",
"user_data/grammars/",
gr.update(visible=True)
]

View file

@ -51,11 +51,11 @@ def create_ui():
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768.')
shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
shared.gradio['ctx_size'] = gr.Number(label='ctx_size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
@ -70,6 +70,7 @@ def create_ui():
shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
@ -90,7 +91,18 @@ def create_ui():
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
# Speculative decoding
with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
with gr.Row():
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.')
shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
with gr.Column():
with gr.Row():
@@ -211,9 +223,9 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
model_dir=shared.args.model_dir if shared.args.model_dir != shared.args_defaults.model_dir else None
)
if output_folder == Path("models"):
if output_folder == Path("user_data/models"):
output_folder = Path(shared.args.model_dir)
elif output_folder == Path("loras"):
elif output_folder == Path("user_data/loras"):
output_folder = Path(shared.args.lora_dir)
if check:
@@ -234,10 +246,8 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
def update_truncation_length(current_length, state):
if 'loader' in state:
if state['loader'].lower().startswith('exllama'):
return state['max_seq_len']
elif state['loader'] == 'llama.cpp':
return state['n_ctx']
if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp':
return state['ctx_size']
return current_length

View file

@@ -93,7 +93,6 @@ def create_ui(default_preset):
shared.gradio['sampler_priority'] = gr.Textbox(value=generate_params['sampler_priority'], lines=12, label='Sampler priority', info='Parameter names separated by new lines or commas.', elem_classes=['add_scrollbar'])
shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=2, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='Written between "" and separated by commas.', placeholder='"\\n", "\\nYou:"')
shared.gradio['custom_token_bans'] = gr.Textbox(value=shared.settings['custom_token_bans'] or None, label='Token bans', info='Token IDs to ban, separated by commas. The IDs can be found in the Default or Notebook tab.')
shared.gradio['show_after'] = gr.Textbox(value=shared.settings['show_after'] or None, label='Show after', info='Hide the reply before this text.', placeholder="</think>")
shared.gradio['negative_prompt'] = gr.Textbox(value=shared.settings['negative_prompt'], label='Negative prompt', info='For CFG. Only used when guidance_scale is different than 1.', lines=3, elem_classes=['add_scrollbar'])
shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=generate_params['dry_sequence_breakers'], label='dry_sequence_breakers', info='Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.')
with gr.Row() as shared.gradio['grammar_file_row']:
@@ -122,16 +121,14 @@ def create_event_handlers():
def get_truncation_length():
if 'max_seq_len' in shared.provided_arguments or shared.args.max_seq_len != shared.args_defaults.max_seq_len:
return shared.args.max_seq_len
elif 'n_ctx' in shared.provided_arguments or shared.args.n_ctx != shared.args_defaults.n_ctx:
return shared.args.n_ctx
if 'ctx_size' in shared.provided_arguments or shared.args.ctx_size != shared.args_defaults.ctx_size:
return shared.args.ctx_size
else:
return shared.settings['truncation_length']
def load_grammar(name):
p = Path(f'grammars/{name}')
p = Path(f'user_data/grammars/{name}')
if p.exists():
return open(p, 'r', encoding='utf-8').read()
else:

View file

@@ -13,7 +13,7 @@ def create_ui():
shared.gradio['reset_interface'] = gr.Button("Apply flags/extensions and restart", interactive=not mu)
with gr.Row():
shared.gradio['toggle_dark_mode'] = gr.Button('Toggle 💡')
shared.gradio['save_settings'] = gr.Button('Save UI defaults to settings.yaml', interactive=not mu)
shared.gradio['save_settings'] = gr.Button('Save UI defaults to user_data/settings.yaml', interactive=not mu)
with gr.Row():
with gr.Column():
@@ -48,7 +48,7 @@ def handle_save_settings(state, preset, extensions, show_controls, theme):
return [
contents,
"settings.yaml",
"./",
"user_data/",
gr.update(visible=True)
]

View file

@@ -76,44 +76,54 @@ def get_available_models():
# Get all GGUF files
gguf_files = get_available_ggufs()
# Filter out non-first parts of multipart GGUF files
filtered_gguf_files = []
for gguf_path in gguf_files:
filename = os.path.basename(gguf_path)
match = re.search(r'-(\d+)-of-\d+\.gguf$', filename)
if match:
part_number = match.group(1)
# Keep only if it's part 1
if part_number.lstrip("0") == "1":
filtered_gguf_files.append(gguf_path)
else:
# Not a multi-part file
filtered_gguf_files.append(gguf_path)
model_dir = Path(shared.args.model_dir)
# Find top-level directories containing GGUF files
dirs_with_gguf = set()
for gguf_path in gguf_files:
path = Path(gguf_path)
if path.parts: # If in a subdirectory
dirs_with_gguf.add(path.parts[0]) # Add top-level directory
if len(path.parts) > 0:
dirs_with_gguf.add(path.parts[0])
# Find directories with safetensors files directly under them
# Find directories with safetensors files
dirs_with_safetensors = set()
for item in os.listdir(model_dir):
item_path = model_dir / item
if item_path.is_dir():
# Check if there are safetensors files directly under this directory
if any(file.lower().endswith(('.safetensors', '.pt')) for file in os.listdir(item_path) if (item_path / file).is_file()):
dirs_with_safetensors.add(item)
# Find valid model directories
model_dirs = []
for item in os.listdir(model_dir):
item_path = model_dir / item
# Skip if not a directory
if not item_path.is_dir():
continue
# Include directory if it either:
# 1. Doesn't contain GGUF files, OR
# 2. Contains both GGUF and safetensors files
# Include directory if it either doesn't contain GGUF files
# or contains both GGUF and safetensors files
if item not in dirs_with_gguf or item in dirs_with_safetensors:
model_dirs.append(item)
model_dirs = sorted(model_dirs, key=natural_keys)
# Combine all models
return ['None'] + gguf_files + model_dirs
return ['None'] + filtered_gguf_files + model_dirs
def get_available_ggufs():
@@ -131,11 +141,11 @@ def get_available_ggufs():
def get_available_presets():
return sorted(set((k.stem for k in Path('presets').glob('*.yaml'))), key=natural_keys)
return sorted(set((k.stem for k in Path('user_data/presets').glob('*.yaml'))), key=natural_keys)
def get_available_prompts():
prompt_files = list(Path('prompts').glob('*.txt'))
prompt_files = list(Path('user_data/prompts').glob('*.txt'))
sorted_files = sorted(prompt_files, key=lambda x: x.stat().st_mtime, reverse=True)
prompts = [file.stem for file in sorted_files]
prompts.append('None')
@@ -143,12 +153,12 @@ def get_available_prompts():
def get_available_characters():
paths = (x for x in Path('characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml'))
paths = (x for x in Path('user_data/characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml'))
return sorted(set((k.stem for k in paths)), key=natural_keys)
def get_available_instruction_templates():
path = "instruction-templates"
path = "user_data/instruction-templates"
paths = []
if os.path.exists(path):
paths = (x for x in Path(path).iterdir() if x.suffix in ('.json', '.yaml', '.yml'))
@@ -179,4 +189,4 @@ def get_available_chat_styles():
def get_available_grammars():
return ['None'] + sorted([item.name for item in list(Path('grammars').glob('*.gbnf'))], key=natural_keys)
return ['None'] + sorted([item.name for item in list(Path('user_data/grammars').glob('*.gbnf'))], key=natural_keys)

View file

@@ -28,14 +28,7 @@ conda_env_path = os.path.join(script_dir, "installer_files", "env")
state_file = '.installer_state.json'
# Command-line flags
cmd_flags_path = os.path.join(script_dir, "CMD_FLAGS.txt")
if os.path.exists(cmd_flags_path):
with open(cmd_flags_path, 'r') as f:
CMD_FLAGS = ' '.join(line.strip().rstrip('\\').strip() for line in f if line.strip().rstrip('\\').strip() and not line.strip().startswith('#'))
else:
CMD_FLAGS = ''
flags = f"{' '.join([flag for flag in sys.argv[1:] if flag != '--update-wizard'])} {CMD_FLAGS}"
flags = f"{' '.join([flag for flag in sys.argv[1:] if flag != '--update-wizard'])}"
def signal_handler(sig, frame):
@@ -300,9 +293,10 @@ def install_webui():
# Write a flag to CMD_FLAGS.txt for CPU mode
if selected_gpu == "NONE":
cmd_flags_path = os.path.join(script_dir, "user_data", "CMD_FLAGS.txt")
with open(cmd_flags_path, 'r+') as cmd_flags_file:
if "--cpu" not in cmd_flags_file.read():
print_big_message("Adding the --cpu flag to CMD_FLAGS.txt.")
print_big_message("Adding the --cpu flag to user_data/CMD_FLAGS.txt.")
cmd_flags_file.write("\n--cpu\n")
# Handle CUDA version display
@@ -538,7 +532,7 @@ if __name__ == "__main__":
flags_list = re.split(' +(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)|=', flags)
model_dir = [flags_list[(flags_list.index(flag) + 1)] for flag in flags_list if flag == '--model-dir'][0].strip('"\'')
else:
model_dir = 'models'
model_dir = 'user_data/models'
if len([item for item in glob.glob(f'{model_dir}/*') if not item.endswith(('.txt', '.yaml'))]) == 0:
print_big_message("You haven't downloaded any model yet.\nOnce the web UI launches, head over to the \"Model\" tab and download one.")

View file

@@ -30,12 +30,12 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@@ -29,6 +29,6 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@@ -29,6 +29,6 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@@ -29,7 +29,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@@ -29,8 +29,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@@ -30,12 +30,12 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,4 +15,4 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,4 +15,4 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@@ -15,6 +15,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkan-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkan-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkanavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+vulkanavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -1,7 +1,6 @@
import os
import warnings
import modules.one_click_installer_check
from modules import shared
from modules.block_requests import OpenMonkeyPatch, RequestBlocker
from modules.logging_colors import logger
@@ -94,8 +93,8 @@ def create_interface():
'filter_by_loader': shared.args.loader or 'All'
})
if Path("cache/pfp_character.png").exists():
Path("cache/pfp_character.png").unlink()
if Path("user_data/cache/pfp_character.png").exists():
Path("user_data/cache/pfp_character.png").unlink()
# css/js strings
css = ui.css
@@ -112,8 +111,8 @@ def create_interface():
shared.gradio['interface_state'] = gr.State({k: None for k in shared.input_elements})
# Audio notification
if Path("notification.mp3").exists():
shared.gradio['audio_notification'] = gr.Audio(interactive=False, value="notification.mp3", elem_id="audio_notification", visible=False)
if Path("user_data/notification.mp3").exists():
shared.gradio['audio_notification'] = gr.Audio(interactive=False, value="user_data/notification.mp3", elem_id="audio_notification", visible=False)
# Floating menus for saving/deleting files
ui_file_saving.create_ui()
@@ -179,7 +178,7 @@ def create_interface():
ssl_keyfile=shared.args.ssl_keyfile,
ssl_certfile=shared.args.ssl_certfile,
root_path=shared.args.subpath,
allowed_paths=["cache", "css", "extensions", "js"]
allowed_paths=["css", "js", "extensions", "user_data/cache"]
)
@@ -192,10 +191,10 @@ if __name__ == "__main__":
settings_file = None
if shared.args.settings is not None and Path(shared.args.settings).exists():
settings_file = Path(shared.args.settings)
elif Path('settings.yaml').exists():
settings_file = Path('settings.yaml')
elif Path('settings.json').exists():
settings_file = Path('settings.json')
elif Path('user_data/settings.yaml').exists():
settings_file = Path('user_data/settings.yaml')
elif Path('user_data/settings.json').exists():
settings_file = Path('user_data/settings.json')
if settings_file is not None:
logger.info(f"Loading settings from \"{settings_file}\"")

View file

@@ -1,11 +0,0 @@
@echo off
cd /D "%~dp0"
set PATH=%PATH%;%SystemRoot%\system32
@rem sed -i 's/\x0D$//' ./wsl.sh converts newlines to unix format in the wsl script
call wsl -e bash -lic "sed -i 's/\x0D$//' ./wsl.sh; source ./wsl.sh %*"
:end
pause

View file

@@ -1,11 +0,0 @@
@echo off
cd /D "%~dp0"
set PATH=%PATH%;%SystemRoot%\system32
@rem sed -i 's/\x0D$//' ./wsl.sh converts newlines to unix format in the wsl script calling wsl.sh with 'update' will run updater
call wsl -e bash -lic "sed -i 's/\x0D$//' ./wsl.sh; source ./wsl.sh update-wizard"
:end
pause

3
user_data/CMD_FLAGS.txt Normal file
View file

@@ -0,0 +1,3 @@
# Add persistent flags here to use every time you launch the web UI.
# Example:
# --listen --api

View file

Binary image file changed (before: 206 KiB, after: 206 KiB).

Some files were not shown because too many files have changed in this diff.