Compare commits

...

174 commits

Author SHA1 Message Date
Mykeehu
dddb887b60
Merge branch 'main' into patch-2 2025-06-04 08:46:55 +02:00
Mykeehu
6da5612fd8
Update chat.py for 3.4.0 version 2025-06-04 08:44:45 +02:00
oobabooga
ae61c1a0f4
Merge pull request #7034 from oobabooga/dev
Merge dev branch
2025-05-30 23:07:56 -03:00
oobabooga
15f466ca3f Update README 2025-05-30 15:49:57 -07:00
oobabooga
219f0a7731 Fix exllamav3_hf models failing to unload (closes #7031) 2025-05-30 12:05:49 -07:00
oobabooga
298d4719c6 Multiple small style improvements 2025-05-30 11:32:24 -07:00
oobabooga
7c29879e79 Fix 'Start reply with' (closes #7033) 2025-05-30 11:17:47 -07:00
oobabooga
af1eef1b08
Merge pull request #7028 from oobabooga/dev
Merge dev branch
2025-05-29 19:07:56 -03:00
oobabooga
28e6bd4fcd Revert "Update transformers requirement in /requirements/full (#7017)"
This reverts commit cc9b7253c1.
2025-05-29 14:49:07 -07:00
oobabooga
d1bfb08e8d Improve the style of message editing 2025-05-29 14:27:47 -07:00
oobabooga
acbcc12e7b Clean up 2025-05-29 14:11:21 -07:00
oobabooga
dce02732a4 Fix timestamp issues when editing/swiping messages 2025-05-29 14:08:48 -07:00
oobabooga
8078c41ec6 Revert "Bump llama.cpp"
This reverts commit a8d02dec8f.
2025-05-29 13:32:19 -07:00
oobabooga
a45a652130 CSS fix 2025-05-29 13:28:51 -07:00
oobabooga
f59998d268 Don't limit the number of prompt characters printed with --verbose 2025-05-29 13:08:48 -07:00
oobabooga
aff41f3482 Update README 2025-05-29 12:53:41 -07:00
oobabooga
e7129f9dbe Prevent footer buttons below last assistant message from always appearing 2025-05-29 12:47:07 -07:00
oobabooga
724147ffab Better detect when no model is available 2025-05-29 10:49:29 -07:00
oobabooga
faa5c82c64 Fix message version count not updating during regeneration streaming 2025-05-29 09:16:26 -07:00
oobabooga
3f37a2e915 Update README 2025-05-29 08:49:31 -07:00
oobabooga
c970c5f166 Make scrollbars darker in dark theme 2025-05-29 08:15:13 -07:00
oobabooga
81794692ab UI: Make the dark theme darker 2025-05-29 08:07:14 -07:00
oobabooga
36bc276005 Update README 2025-05-29 05:39:26 -07:00
oobabooga
0986d075fb Update README 2025-05-29 05:03:59 -07:00
oobabooga
9a94d7b4f6 Update README 2025-05-29 05:02:52 -07:00
oobabooga
2a9699033d Update README 2025-05-29 04:55:59 -07:00
oobabooga
f2ee917d4f Update README 2025-05-29 04:55:05 -07:00
oobabooga
685cfe2540 Lint 2025-05-29 04:26:43 -07:00
oobabooga
a8d02dec8f Bump llama.cpp 2025-05-29 04:24:21 -07:00
Underscore
63234b9b6f
UI: Fix impersonate (#7025) 2025-05-29 08:22:03 -03:00
oobabooga
75d6cfd14d Download fetched web search results in parallel 2025-05-28 20:36:24 -07:00
oobabooga
7080a02252 Reduce the timeout for downloading web pages 2025-05-28 18:15:21 -07:00
oobabooga
3eb0b77427 Improve the web search query generation 2025-05-28 18:14:51 -07:00
oobabooga
27641ac182 UI: Make message editing work the same for user and assistant messages 2025-05-28 17:23:46 -07:00
oobabooga
6c3590ba9a Make web search attachments clickable 2025-05-28 05:28:15 -07:00
oobabooga
0aedb89921 UI: Small style improvement to attachments 2025-05-28 00:35:20 -07:00
oobabooga
75c6ae8502 UI: Don't edit messages on double click 2025-05-28 00:29:17 -07:00
oobabooga
077bbc6b10
Add web search support (#7023) 2025-05-28 04:27:28 -03:00
oobabooga
1b0e2d8750 UI: Add a token counter to the chat tab (counts input + history) 2025-05-27 22:36:24 -07:00
oobabooga
f6ca0ee072 Fix regenerate sometimes not creating a new message version 2025-05-27 21:20:51 -07:00
oobabooga
2db36da979 UI: Make scrollbars more discrete in dark mode 2025-05-27 21:00:11 -07:00
Underscore
5028480eba
UI: Add footer buttons for editing messages (#7019)
---------

Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com>
2025-05-28 00:55:27 -03:00
Underscore
355b5f6c8b
UI: Add message version navigation (#6947)
---------

Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com>
2025-05-27 22:54:18 -03:00
dependabot[bot]
cc9b7253c1
Update transformers requirement in /requirements/full (#7017) 2025-05-26 23:13:10 -03:00
Underscore
8531100109
Fix textbox text usage in methods (#7009) 2025-05-26 22:40:09 -03:00
djholtby
73bfc936a0
Close response generator when stopping API generation (#7014) 2025-05-26 22:39:03 -03:00
oobabooga
bae1aa34aa Fix loading Llama-3_3-Nemotron-Super-49B-v1 and similar models (closes #7012) 2025-05-25 17:19:26 -07:00
oobabooga
7f6579ab20 Minor style change 2025-05-20 21:49:44 -07:00
oobabooga
0d3f854778 Improve the style of thinking blocks 2025-05-20 21:40:42 -07:00
oobabooga
8620d6ffe7 Make it possible to upload multiple text files/pdfs at once 2025-05-20 21:34:07 -07:00
oobabooga
cc8a4fdcb1 Minor improvement to attachments prompt format 2025-05-20 21:31:18 -07:00
oobabooga
409a48d6bd
Add attachments support (text files, PDF documents) (#7005) 2025-05-21 00:36:20 -03:00
oobabooga
5d00574a56 Minor UI fixes 2025-05-20 16:20:49 -07:00
oobabooga
51c50b265d Update llama.cpp to b7a17463ec 2025-05-20 11:16:12 -07:00
oobabooga
616ea6966d
Store previous reply versions on regenerate (#7004) 2025-05-20 12:51:28 -03:00
Daniel Dengler
c25a381540
Add a "Branch here" footer button to chat messages (#6967) 2025-05-20 11:07:40 -03:00
oobabooga
8e10f9894a
Add a metadata field to the chat history & add date/time to chat messages (#7003) 2025-05-20 10:48:46 -03:00
oobabooga
9ec46b8c44 Remove the HQQ loader (HQQ models can be loaded through Transformers) 2025-05-19 09:23:24 -07:00
oobabooga
0c7237e4b7 Update README 2025-05-18 20:01:29 -07:00
oobabooga
bad1da99db Merge remote-tracking branch 'refs/remotes/origin/dev' into dev 2025-05-18 14:09:08 -07:00
oobabooga
0c1bc6d1d0 Bump llama.cpp 2025-05-18 14:08:54 -07:00
Tiago Silva
9cd6ea6c0b
Fix Dockerfile in AMD and Intel (#6995) 2025-05-18 18:07:16 -03:00
oobabooga
83bfd5c64b Fix API issues 2025-05-18 12:45:01 -07:00
oobabooga
126b3a768f Revert "Dynamic Chat Message UI Update Speed (#6952)" (for now)
This reverts commit 8137eb8ef4.
2025-05-18 12:38:36 -07:00
oobabooga
9d7a36356d Remove unnecessary js that was causing scrolling issues 2025-05-18 10:56:16 -07:00
oobabooga
2faaf18f1f Add back the "Common values" to the ctx-size slider 2025-05-18 09:06:20 -07:00
oobabooga
f1ec6c8662 Minor label changes 2025-05-18 09:04:51 -07:00
oobabooga
bd13a8f255 UI: Light theme improvement 2025-05-17 22:31:55 -07:00
oobabooga
076aa67963 Fix API issues 2025-05-17 22:22:18 -07:00
oobabooga
366de4b561 UI: Fix the chat area height when "Show controls" is unchecked 2025-05-17 17:11:38 -07:00
oobabooga
e8595730b4
Merge pull request #6992 from oobabooga/dev
Merge dev branch
2025-05-17 11:58:46 -03:00
oobabooga
61276f6a37 Merge remote-tracking branch 'refs/remotes/origin/dev' into dev 2025-05-17 07:22:51 -07:00
oobabooga
4800d1d522 More robust VRAM calculation 2025-05-17 07:20:38 -07:00
mamei16
052c82b664
Fix KeyError: 'gpu_layers' when loading existing model settings (#6991) 2025-05-17 11:19:13 -03:00
oobabooga
0f77ff9670 UI: Use total VRAM (not free) for layers calculation when a model is loaded 2025-05-16 19:19:22 -07:00
oobabooga
17c29fa0a2
Merge pull request #6987 from oobabooga/dev
Merge dev branch
2025-05-16 22:23:59 -03:00
oobabooga
4bf763e1d9 Multiple small CSS fixes 2025-05-16 18:22:43 -07:00
oobabooga
c0e295dd1d Remove the 'None' option from the model menu 2025-05-16 17:53:20 -07:00
oobabooga
e3bba510d4 UI: Only add a blank space to streaming messages in instruct mode 2025-05-16 17:49:17 -07:00
oobabooga
71fa046c17 Minor changes after 1c549d176b 2025-05-16 17:38:08 -07:00
oobabooga
d99fb0a22a Add backward compatibility with saved n_gpu_layers values 2025-05-16 17:29:18 -07:00
oobabooga
1c549d176b Fix GPU layers slider: honor saved settings and show true maximum 2025-05-16 17:26:13 -07:00
oobabooga
dc3094549e
Merge pull request #6984 from oobabooga/dev
Merge dev branch
2025-05-16 17:13:26 -03:00
oobabooga
e4d3f4449d API: Fix a regression 2025-05-16 13:02:27 -07:00
oobabooga
470c822f44 API: Hide the uvicorn access logs from the terminal 2025-05-16 12:54:39 -07:00
oobabooga
adb975a380 Prevent fractional gpu-layers in the UI 2025-05-16 12:52:43 -07:00
oobabooga
fc483650b5 Set the maximum gpu_layers value automatically when the model is loaded with --model 2025-05-16 11:58:17 -07:00
oobabooga
38c50087fe Prevent a crash on systems without an NVIDIA GPU 2025-05-16 11:55:30 -07:00
oobabooga
253e85a519 Only compute VRAM/GPU layers for llama.cpp models 2025-05-16 10:02:30 -07:00
oobabooga
9ec9b1bf83 Auto-adjust GPU layers after model unload to utilize freed VRAM 2025-05-16 09:56:23 -07:00
oobabooga
ee7b3028ac Always cache GGUF metadata calls 2025-05-16 09:12:36 -07:00
oobabooga
4925c307cf Auto-adjust GPU layers on context size and cache type changes + many fixes 2025-05-16 09:07:38 -07:00
oobabooga
93e1850a2c Only show the VRAM info for llama.cpp 2025-05-15 21:42:15 -07:00
oobabooga
cbf4daf1c8 Hide the LoRA menu in portable mode 2025-05-15 21:21:54 -07:00
oobabooga
fd61297933 Lint 2025-05-15 21:19:19 -07:00
oobabooga
8cb73b78e1 Update ExLlamaV3 2025-05-15 20:10:34 -07:00
oobabooga
041248cc9f Update llama.cpp 2025-05-15 20:10:02 -07:00
oobabooga
5534d01da0
Estimate the VRAM for GGUF models + autoset gpu-layers (#6980) 2025-05-16 00:07:37 -03:00
oobabooga
c4a715fd1e UI: Move the LoRA menu under "Other options" 2025-05-13 20:14:09 -07:00
oobabooga
035cd3e2a9 UI: Hide the extension install menu in portable builds 2025-05-13 20:09:22 -07:00
oobabooga
2826c60044 Use logger for "Output generated in ..." messages 2025-05-13 14:45:46 -07:00
oobabooga
3fa1a899ae UI: Fix gpu-layers being ignored (closes #6973) 2025-05-13 12:07:59 -07:00
oobabooga
c375b69413 API: Fix llama.cpp generating after disconnect, improve disconnect detection, fix deadlock on simultaneous requests 2025-05-13 11:23:33 -07:00
oobabooga
62c774bf24 Revert "New attempt"
This reverts commit e7ac06c169.
2025-05-13 06:42:25 -07:00
oobabooga
e7ac06c169 New attempt 2025-05-10 19:20:04 -07:00
oobabooga
0c5fa3728e Revert "Fix API failing to cancel streams (attempt), closes #6966"
This reverts commit 006a866079.
2025-05-10 19:12:40 -07:00
oobabooga
006a866079 Fix API failing to cancel streams (attempt), closes #6966 2025-05-10 17:55:48 -07:00
oobabooga
47d4758509 Fix #6970 2025-05-10 17:46:00 -07:00
oobabooga
4920981b14 UI: Remove the typing cursor 2025-05-09 20:35:38 -07:00
oobabooga
8984e95c67 UI: More friendly message when no model is loaded 2025-05-09 07:21:05 -07:00
oobabooga
2bde625d57 Update README 2025-05-09 00:19:25 -07:00
oobabooga
512bc2d0e0 UI: Update some labels 2025-05-08 23:43:55 -07:00
oobabooga
f8ef6e09af UI: Make ctx-size a slider 2025-05-08 18:19:04 -07:00
oobabooga
bf7e4a4597 Docs: Add a tool/function calling example (from https://github.com/oobabooga/text-generation-webui/pull/6827#issuecomment-2854716960) 2025-05-08 16:12:07 -07:00
oobabooga
9ea2a69210 llama.cpp: Add --no-webui to the llama-server command 2025-05-08 10:41:25 -07:00
oobabooga
3bc2ec2b11 Fix #6965 2025-05-08 10:34:09 -07:00
oobabooga
1c7209a725 Save the chat history periodically during streaming 2025-05-08 09:46:43 -07:00
oobabooga
a1b3307b66 Bump llama.cpp 2025-05-08 08:58:43 -07:00
Jonas
fa960496d5
Tools support for OpenAI compatible API (#6827) 2025-05-08 12:30:27 -03:00
Scott Z
ed6e16191d
Docker fix for NVIDIA (#6964) 2025-05-08 12:21:52 -03:00
oobabooga
13a434f351 Bump exllamav3 2025-05-08 08:06:07 -07:00
oobabooga
a2ab42d390 UI: Remove the exllamav2 info message 2025-05-08 08:00:38 -07:00
oobabooga
348d4860c2 UI: Create a "Main options" section in the Model tab 2025-05-08 07:58:59 -07:00
oobabooga
d2bae7694c UI: Change the ctx-size description 2025-05-08 07:26:23 -07:00
oobabooga
b28fa86db6 Default --gpu-layers to 256 2025-05-06 17:51:55 -07:00
oobabooga
760b4dd115 Merge remote-tracking branch 'refs/remotes/origin/dev' into dev 2025-05-06 14:02:57 -07:00
oobabooga
e4fb2475d2 UI: Multiple small style improvements (light/dark themes) 2025-05-06 14:02:15 -07:00
Downtown-Case
5ef564a22e
Fix model config loading in shared.py for Python 3.13 (#6961) 2025-05-06 17:03:33 -03:00
oobabooga
c4f36db0d8 llama.cpp: remove tfs (it doesn't get used) 2025-05-06 08:41:13 -07:00
oobabooga
05115e42ee Set top_n_sigma before temperature by default 2025-05-06 08:27:21 -07:00
oobabooga
1927afe894 Fix top_n_sigma not showing for llama.cpp 2025-05-06 08:18:49 -07:00
oobabooga
605cc9ab14 Update exllamav3 2025-05-06 06:43:35 -07:00
oobabooga
89590adc14 Update llama.cpp 2025-05-06 06:41:17 -07:00
oobabooga
d1c0154d66 llama.cpp: Add top_n_sigma, fix typical_p in sampler priority 2025-05-06 06:38:39 -07:00
oobabooga
cbef35054c UI: CSS fix 2025-05-05 17:46:09 -07:00
Evgenii Novikov
4e8f628d3c
docker: App uid typo in other docker composes (#6958) 2025-05-05 20:05:15 -03:00
oobabooga
530223bf0b UI: Fix the hover menu colors 2025-05-05 16:03:43 -07:00
oobabooga
76f947e3cf UI: Minor style change 2025-05-05 15:58:29 -07:00
Alireza Ghasemi
99bd66445f
SuperboogaV2: minor update to avoid json serialization errors #6945 2025-05-05 19:04:06 -03:00
Evgenii Novikov
987505ead3
docker: Fix app uid typo in cpu docker compose (#6957) 2025-05-05 19:03:33 -03:00
oobabooga
941e0663da Update README 2025-05-05 14:18:16 -07:00
oobabooga
f82667f0b4 Remove more multimodal extension references 2025-05-05 14:17:00 -07:00
oobabooga
85bf2e15b9 API: Remove obsolete multimodal extension handling
Multimodal support will be added back once it's implemented in llama-server.
2025-05-05 14:14:48 -07:00
mamei16
8137eb8ef4
Dynamic Chat Message UI Update Speed (#6952) 2025-05-05 18:05:23 -03:00
oobabooga
53d8e46502 Ensure environment isolation in portable installs 2025-05-05 12:28:17 -07:00
oobabooga
bf5290bc0f Fix the hover menu in light theme 2025-05-05 08:04:12 -07:00
oobabooga
967b70327e Light theme improvement 2025-05-05 07:59:02 -07:00
oobabooga
6001d279c6 Light theme improvement 2025-05-05 07:42:13 -07:00
oobabooga
475e012ee8 UI: Improve the light theme colors 2025-05-05 06:16:29 -07:00
oobabooga
b817bb33fd Minor fix after df7bb0db1f 2025-05-05 05:00:20 -07:00
oobabooga
f3da45f65d ExLlamaV3_HF: Change max_chunk_size to 256 2025-05-04 20:37:15 -07:00
oobabooga
df7bb0db1f Rename --n-gpu-layers to --gpu-layers 2025-05-04 20:03:55 -07:00
oobabooga
d0211afb3c Save the chat history right after sending a message 2025-05-04 18:52:01 -07:00
oobabooga
2da197bba4 Refinement after previous commit 2025-05-04 18:29:05 -07:00
oobabooga
690d693913 UI: Add padding to only show the last message/reply after sending a message
To avoid scrolling
2025-05-04 18:13:29 -07:00
oobabooga
d9da16edba UI: Remove the chat input textarea border 2025-05-04 16:53:52 -07:00
oobabooga
84ab1f95be UI: Increase the chat area a bit 2025-05-04 15:21:52 -07:00
oobabooga
d186621926 UI: Fixes after previous commit 2025-05-04 15:19:46 -07:00
oobabooga
7853fb1c8d
Optimize the Chat tab (#6948) 2025-05-04 18:58:37 -03:00
oobabooga
b7a5c7db8d llama.cpp: Handle short arguments in --extra-flags 2025-05-04 07:14:42 -07:00
oobabooga
5f5569e9ac Update README 2025-05-04 06:20:36 -07:00
oobabooga
4c2e3b168b llama.cpp: Add a retry mechanism when getting the logits (sometimes it fails) 2025-05-03 06:51:20 -07:00
oobabooga
ea60f14674 UI: Show the list of files if the user tries to download a GGUF repository 2025-05-03 06:06:50 -07:00
oobabooga
b71ef50e9d UI: Add a min-height to prevent constant scrolling during chat streaming 2025-05-02 23:45:58 -07:00
oobabooga
b21bd8bb1e UI: Invert user/assistant message colors in instruct mode
The goal is to make assistant messages more readable.
2025-05-02 22:43:33 -07:00
oobabooga
d08acb4af9 UI: Rename enable_thinking -> Enable thinking 2025-05-02 20:50:52 -07:00
oobabooga
3526b7923c Remove extensions with requirements from portable builds 2025-05-02 17:40:53 -07:00
oobabooga
4cea720da8 UI: Remove the "Autoload the model" feature 2025-05-02 16:38:28 -07:00
oobabooga
905afced1c Add a --portable flag to hide things in portable mode 2025-05-02 16:34:29 -07:00
oobabooga
3f26b0408b Fix after 9e3867dc83 2025-05-02 16:17:22 -07:00
oobabooga
9e3867dc83 llama.cpp: Fix manual random seeds 2025-05-02 09:36:15 -07:00
oobabooga
d5c407cf35 Use Vulkan instead of ROCm for llama.cpp on AMD 2025-05-01 20:05:36 -07:00
oobabooga
f8aaf3c23a Use ROCm 6.2.4 on AMD 2025-05-01 19:50:46 -07:00
oobabooga
c12a53c998 Use turboderp's exllamav2 wheels 2025-05-01 19:46:56 -07:00
72 changed files with 2414 additions and 664 deletions

View file

@ -102,6 +102,8 @@ jobs:
shell: bash
run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables
CUDA_VERSION="${{ matrix.cuda }}"

View file

@ -101,6 +101,8 @@ jobs:
shell: bash
run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"

View file

@ -101,6 +101,8 @@ jobs:
shell: bash
run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"

View file

@ -12,18 +12,20 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## Features
- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2).
- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is also supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile).
- Additional quantization libraries like [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) can be used with the Transformers loader if you install them manually.
- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for llama.cpp GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment.
- UI that resembles the original ChatGPT style.
- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats.
- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`.
- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these.
- Multiple sampling parameters and generation options for sophisticated text generation control.
- Switch between different models easily in the UI without restarting, with fine control over settings.
- OpenAI-compatible API with Chat and Completions endpoints; see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)).
- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
- 100% offline and private, with zero telemetry, external resources, or remote update requests.
- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats.
- **File attachments**: Upload text files and PDF documents to talk about their contents.
- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
- Aesthetic UI with dark and light themes.
- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters.
- Edit messages, navigate between message versions, and branch conversations at any point.
- Multiple sampling parameters and generation options for sophisticated text generation control.
- Switch between different models in the UI without restarting.
- Automatic GPU layers for GGUF models (on NVIDIA GPUs).
- Free-form text generation in the Default/Notebook tabs without being limited to chat turns.
- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support; see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
- Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
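For the OpenAI-compatible API bullet above, a minimal request sketch, assuming the UI was started with `--api` and listens on the default API port 5000 (the same address used by the curl example further down this page):

```
# Minimal sketch: one chat completion against the OpenAI-compatible endpoint.
import requests

url = "http://127.0.0.1:5000/v1/chat/completions"
payload = {
    "messages": [
        {"role": "user", "content": "Explain what a GGUF file is in one sentence."}
    ],
    "max_tokens": 200,
}

resp = requests.post(url, json=payload, timeout=120)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```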
## How to install
@ -44,7 +46,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases
To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again.
You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, or `update_wizard_macos.sh`.
<details>
<summary>
@ -55,12 +57,12 @@ Setup details and information about installing manually
The script uses Miniconda to set up a Conda environment in the `installer_files` folder.
If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, `cmd_macos.sh`, or `cmd_wsl.bat`.
If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`.
* There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root.
* To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts.
* For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki).
* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.
* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.
### Manual installation using Conda
@ -90,7 +92,7 @@ conda activate textgen
|--------|---------|---------|
| Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
| Linux/WSL | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu` |
| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.1` |
| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` |
| MacOS + MPS | Any | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` |
| Windows | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
| Windows | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` |
@ -146,14 +148,14 @@ The `requirements*.txt` above contain various wheels precompiled through GitHub
For NVIDIA GPU:
ln -s docker/{nvidia/Dockerfile,nvidia/docker-compose.yml,.dockerignore} .
For AMD GPU:
ln -s docker/{amd/Dockerfile,intel/docker-compose.yml,.dockerignore} .
ln -s docker/{amd/Dockerfile,amd/docker-compose.yml,.dockerignore} .
For Intel GPU:
ln -s docker/{intel/Dockerfile,amd/docker-compose.yml,.dockerignore} .
For CPU only
ln -s docker/{cpu/Dockerfile,cpu/docker-compose.yml,.dockerignore} .
cp docker/.env.example .env
#Create logs/cache dir :
mkdir -p logs cache
mkdir -p user_data/logs user_data/cache
# Edit .env and set:
# TORCH_CUDA_ARCH_LIST based on your GPU model
# APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal)
@ -187,13 +189,13 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--
[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
[--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit]
[--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap]
[--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N]
[--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N]
[--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT]
[--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner]
[--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE]
[--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH]
[--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api]
[--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
[--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR]
[--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT]
[--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
[--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY]
[--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
Text generation web UI
@ -215,7 +217,7 @@ Basic settings:
--idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.
Model loader:
--loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ,
--loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2,
TensorRT-LLM.
Transformers/Accelerate:
@ -246,16 +248,18 @@ llama.cpp:
--batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval.
--no-mmap Prevent mmap from being used.
--mlock Force the system to keep the model in RAM.
--n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU.
--gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU.
--tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.
--numa Activate NUMA task allocation for llama.cpp.
--no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
--row-split Split the model by rows across GPUs. This may improve multi-gpu performance.
--extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"
--extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"
--streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
Context and cache management:
Context and cache:
--ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens.
--cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits
separately, e.g. q4_q8).
Speculative decoding:
--model-draft MODEL_DRAFT Path to the draft model for speculative decoding.
@ -274,15 +278,9 @@ ExLlamaV2:
--num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral.
--enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2.
HQQ:
--hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.
TensorRT-LLM:
--cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
Cache:
--cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.
DeepSpeed:
--deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.
--nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading.
@ -305,6 +303,7 @@ Gradio:
--ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file.
--subpath SUBPATH Customize the subpath for gradio, use with reverse proxy
--old-colors Use the legacy Gradio colors, before the December/2024 update.
--portable Hide features not available in portable mode like training.
API:
--api Enable the API extension.

View file

@ -1,7 +1,9 @@
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 28px;
padding-bottom: 22px;
padding-top: 6px;
font-size: 18px;
font-family: Roboto, Arial, sans-serif; /* Modern font */
line-height: 1.5;
@ -102,6 +104,7 @@
@media screen and (width <= 688px) {
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 25px;
font-size: 15px;

View file

@ -2,8 +2,10 @@
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 28px;
padding-bottom: 21px;
padding-top: 7px;
font-size: 18px;
font-family: 'Noto Sans', Arial, sans-serif;
line-height: 1.428571429;
@ -100,6 +102,7 @@
@media screen and (width <= 688px) {
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 25px;
font-size: 15px;

View file

@ -16,6 +16,7 @@
}
.message {
padding-bottom: 2em;
padding-bottom: 1.5em;
padding-top: 0.5em;
grid-template-columns: 70px minmax(0, 1fr);
}

View file

@ -1,7 +1,9 @@
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 2em;
padding-bottom: 1.5em;
padding-top: 0.5em;
font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 22.5px !important;

View file

@ -1,5 +1,6 @@
.message {
padding-bottom: 25px;
padding-bottom: 22px;
padding-top: 3px;
font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 1.428571429;

View file

@ -1,5 +1,6 @@
.message {
padding-bottom: 25px;
padding-bottom: 22px;
padding-top: 3px;
font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 1.428571429;

View file

@ -8,10 +8,6 @@
padding-top: 0 !important;
}
.chat > .messages > :last-child {
margin-bottom: 1.7rem !important;
}
.chat .message-body p, .chat .message-body li {
font-size: 1rem !important;
line-height: 28px !important;
@ -46,7 +42,7 @@
}
.chat .user-message {
background: #f5f5f5;
background: #f3f4f6;
padding: 1.5rem 1rem;
padding-bottom: 2rem;
border-radius: 0;
@ -61,16 +57,16 @@
}
.dark .chat .user-message {
background: transparent;
background: var(--light-gray);
}
.dark .chat .assistant-message {
background: var(--light-gray);
background: transparent;
}
.chat .user-message .text,
.chat .assistant-message .text {
max-width: 645px;
max-width: 700px;
margin-left: auto;
margin-right: auto;
}

View file

@ -1,11 +1,11 @@
:root {
--darker-gray: #202123;
--dark-gray: #343541;
--light-gray: #444654;
--light-theme-gray: #f5f5f5;
--dark-gray: #2A2B32;
--light-gray: #373943;
--light-theme-gray: #f9fbff;
--border-color-dark: #525252;
--header-width: 112px;
--selected-item-color-dark: #32333e;
--selected-item-color-dark: #2E2F38;
}
@font-face {
@ -131,7 +131,7 @@ gradio-app > :first-child {
}
.header_bar {
box-shadow: 0 0 3px rgba(22 22 22 / 35%);
border-right: var(--input-border-width) solid var(--input-border-color);
margin-bottom: 0;
overflow-x: scroll;
text-wrap: nowrap;
@ -265,7 +265,7 @@ button {
.dark .pretty_scrollbar::-webkit-scrollbar-thumb,
.dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover {
background: #ccc;
background: rgb(255 255 255 / 10%);
border-radius: 10px;
}
@ -389,8 +389,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.chat {
margin-left: auto;
margin-right: auto;
min-height: var(--chat-height);
overflow-y: auto;
flex: 1;
overflow-y: hidden;
display: flex;
flex-direction: column;
word-break: break-word;
@ -401,10 +401,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.chat-parent {
height: calc(100dvh - 98px - var(--input-delta));
flex: 1;
overflow: auto !important;
border-radius: 0 !important;
margin-bottom: var(--input-delta) !important;
}
.chat-parent .prose {
@ -420,14 +419,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
padding-right: 1rem;
}
.chat .message .timestamp {
font-size: 0.7em;
display: inline-block;
font-weight: normal;
opacity: 0.7;
margin-left: 5px;
}
.chat-parent.bigchat {
height: calc(100dvh - 98px - var(--input-delta)) !important;
margin-bottom: var(--input-delta) !important;
flex: 1;
}
.chat > .messages {
display: flex;
flex-direction: column;
min-height: calc(100vh - 102px);
}
.chat > .messages > :first-child {
@ -546,7 +553,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
border-radius: 5px;
font-size: 82%;
padding: 1px 3px;
background: white !important;
background: #f3f4f6 !important;
color: #1f2328;
}
@ -560,18 +567,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
padding: 15px;
}
.message-body :not(pre) > code::before {
content: "`";
}
.message-body :not(pre) > code::after {
content: "`";
}
.message-body :not(pre) > code {
white-space: normal !important;
font-weight: bold;
font-family: unset;
font-size: 0.95em;
font-family: Menlo,"Roboto Mono","Courier New",Courier,monospace,Inter,sans-serif;
padding: .15rem .3rem;
background-color: #ececec;
}
.dark .message-body :not(pre) > code {
background-color: rgb(255 255 255 / 10%);
}
#chat-input {
@ -582,7 +588,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-input textarea {
background: #f3f4f6;
padding: 0.65rem 2.5rem;
border: 0;
box-shadow: 0;
border-radius: 8px;
}
#chat-input textarea::placeholder {
@ -602,9 +612,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
display: none;
}
#chat-input .submit-button {
display: none;
}
#chat-input .upload-button {
margin-right: 16px;
margin-bottom: 7px;
background: transparent;
}
.chat-input-positioned {
position: absolute;
bottom: 0;
max-width: 54rem;
left: 50%;
transform: translateX(-50%);
@ -744,7 +762,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.hover-menu button {
width: 100%;
background: transparent !important;
background: white !important;
border-radius: 0 !important;
justify-content: space-between;
margin: 0 !important;
@ -760,7 +778,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.hover-menu button:hover {
background: var(--button-secondary-background-fill-hover) !important;
background: #dbeafe !important;
}
.dark .hover-menu button:hover {
background: var(--selected-item-color-dark) !important;
}
.transparent-substring {
@ -789,6 +811,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-input-container {
display: flex;
flex-direction: column;
min-width: 0 !important;
}
@ -798,9 +822,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-input-row {
padding-bottom: 1.5em;
padding-left: 1rem;
padding-right: 1rem;
padding: 1rem;
padding-top: 0;
}
#chat-input-row.bigchat {
@ -808,27 +831,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-col {
padding-bottom: 100px;
height: 100dvh;
display: flex;
flex-direction: column;
padding-bottom: 0;
gap: 0;
}
@media screen and (width <= 924px) {
#chat-col {
padding-bottom: 100px;
margin-top: 32px;
position: relative; /* Ensure positioning for the pseudo-element */
}
.chat-parent {
height: calc(100dvh - 98px - var(--input-delta) - 32px);
}
.chat-parent.bigchat {
height: calc(100dvh - 98px - var(--input-delta) - 32px) !important;
height: calc(100dvh - 32px);
}
}
#chat-col.bigchat {
padding-bottom: 80px !important;
padding-bottom: 15px !important;
}
.message-body ol, .message-body ul {
@ -985,6 +1003,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
cursor: pointer;
}
#past-chats .selected,
#past-chats label:hover {
background-color: #dbeafe !important;
}
#past-chats-buttons,
#delete-chat-row,
#rename-row {
@ -993,7 +1016,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
gap: 9px;
}
#past-chats-row,
#chat-controls {
width: 260px;
@ -1111,12 +1133,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
color: #9ca3af;
}
.dark .hover-menu {
background-color: var(--darker-gray);
}
.dark .hover-menu button {
border-color: var(--border-color-primary);
background-color: var(--darker-gray) !important;
}
.dark #chat-controls,
@ -1125,8 +1144,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
border: 0 !important;
}
.dark #past-chats .selected,
.dark #past-chats label:hover {
.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats .selected,
.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats label:hover {
background-color: var(--selected-item-color-dark) !important;
}
@ -1163,7 +1182,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.header_bar button.selected {
background: #E0E0E0;
background: #dbeafe;
}
#chat-controls,
@ -1171,11 +1190,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
background-color: var(--light-theme-gray);
}
#chat-controls {
.dark #chat-controls {
border-left: 1px solid #d9d9d0;
}
#past-chats-row {
.dark #past-chats-row {
border-right: 1px solid #d9d9d0;
}
@ -1236,42 +1255,31 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
position: relative;
}
.footer-button {
/* New container for the buttons */
.message-actions {
position: absolute;
bottom: -23px;
left: 0;
display: flex;
gap: 5px;
opacity: 0;
transition: opacity 0.2s;
}
.footer-button {
padding: 0;
margin: 0;
border: none;
border-radius: 3px;
cursor: pointer;
opacity: 0;
display: flex;
align-items: center;
transition: opacity 0.2s;
justify-content: center;
}
.footer-button.footer-copy-button {
bottom: -23px;
left: 0;
}
.footer-button.footer-refresh-button {
bottom: -23px;
left: 25px;
}
.footer-button.footer-continue-button {
bottom: -23px;
left: 50px;
}
.footer-button.footer-remove-button {
bottom: -23px;
left: 75px;
}
.message:hover .footer-button,
.user-message:hover .footer-button,
.assistant-message:hover .footer-button {
.message:hover .message-actions,
.user-message:hover .message-actions,
.assistant-message:hover .message-actions {
opacity: 1;
}
@ -1362,6 +1370,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
contain: layout;
}
.chat .message-body .thinking-content p,
.chat .message-body .thinking-content li {
font-size: 15px !important;
}
/* Animation for opening thinking blocks */
@keyframes fadeIn {
from { opacity: 0; }
@ -1382,3 +1395,163 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
50% { opacity: 1; }
100% { opacity: 0.6; }
}
strong {
font-weight: bold;
}
.min.svelte-1ybaih5 {
min-height: 0;
}
#vram-info .value {
color: #008d00;
}
.dark #vram-info .value {
color: #07ff07;
}
.message-attachments {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 8px;
padding-bottom: 6px;
}
.attachment-box {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
padding: 8px;
background: rgb(0 0 0 / 5%);
border-radius: 6px;
border: 1px solid rgb(0 0 0 / 10%);
min-width: 80px;
max-width: 120px;
}
.attachment-icon {
margin-bottom: 4px;
color: #555;
}
.attachment-name {
font-size: 0.8em;
text-align: center;
word-break: break-word;
overflow: hidden;
text-overflow: ellipsis;
display: -webkit-box;
-webkit-line-clamp: 2;
-webkit-box-orient: vertical;
}
.dark .attachment-box {
background: rgb(255 255 255 / 5%);
border: 1px solid rgb(255 255 255 / 10%);
}
.dark .attachment-icon {
color: #ccc;
}
/* Message Editing Styles */
.editing-textarea {
width: 100%;
min-height: 200px;
max-height: 65vh;
padding: 10px;
border-radius: 5px;
border: 1px solid #ccc;
background-color: var(--light-theme-gray);
font-family: inherit;
font-size: inherit;
resize: vertical;
}
.dark .editing-textarea {
border: 1px solid var(--border-color-dark);
background-color: var(--darker-gray);
}
.editing-textarea:focus {
outline: none;
border-color: var(--selected-item-color-dark);
}
.edit-controls-container {
margin-top: 0;
display: flex;
gap: 8px;
padding-bottom: 8px;
}
.edit-control-button {
padding: 6px 12px;
border: 1px solid #ccc;
border-radius: 4px;
cursor: pointer;
background-color: #f8f9fa;
color: #212529;
font-size: 12px;
margin: 0;
}
.dark .edit-control-button {
border: 1px solid var(--border-color-dark);
background-color: var(--light-gray);
color: #efefef;
}
/* --- Simple Version Navigation --- */
.version-navigation {
position: absolute;
bottom: -23px;
right: 0;
display: flex;
align-items: center;
gap: 5px;
opacity: 0;
transition: opacity 0.2s;
}
.message:hover .version-navigation,
.user-message:hover .version-navigation,
.assistant-message:hover .version-navigation {
opacity: 1;
}
.version-nav-button {
padding: 2px 6px;
font-size: 12px;
min-width: auto;
}
.version-nav-button[disabled] {
opacity: 0.3;
cursor: not-allowed;
}
.version-position {
font-size: 11px;
color: currentcolor;
font-family: monospace;
min-width: 35px;
text-align: center;
opacity: 0.8;
user-select: none;
}
.token-display {
font-family: monospace;
font-size: 13px;
color: var(--body-text-color-subdued);
margin-top: 4px;
}
button:focus {
outline: none;
}

View file

@ -14,7 +14,7 @@ WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=B LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
WORKDIR /home/app/text-generation-webui
# set umask to ensure group read / write at runtime

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:
@ -41,14 +41,4 @@ services:
security_opt:
- seccomp=unconfined
volumes:
- ./cache:/home/app/text-generation-webui/cache
- ./characters:/home/app/text-generation-webui/characters
- ./extensions:/home/app/text-generation-webui/extensions
- ./loras:/home/app/text-generation-webui/loras
- ./logs:/home/app/text-generation-webui/logs
- ./models:/home/app/text-generation-webui/models
- ./presets:/home/app/text-generation-webui/presets
- ./prompts:/home/app/text-generation-webui/prompts
- ./softprompts:/home/app/text-generation-webui/softprompts
- ./training:/home/app/text-generation-webui/training
- ./cloudflared:/etc/cloudflared
- ./user_data:/home/app/text-generation-webui/user_data
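For context on the `APP_UID` change repeated across these compose files: in POSIX shells and Compose interpolation, `${APP_UID:-6972}` substitutes 6972 when the variable is unset or empty, while the old `${APP_UID-6972}` only covers the unset case, so an explicitly empty `APP_UID` produced an empty uid. A small Python sketch of the difference (illustrative only, not project code):

```
import os

def expand(name: str, default: str, colon_form: bool) -> str:
    """colon_form=True mimics ${NAME:-default}; False mimics ${NAME-default}."""
    if name not in os.environ:
        return default
    value = os.environ[name]
    if colon_form and value == "":
        return default
    return value

os.environ["APP_UID"] = ""              # set, but empty
print(expand("APP_UID", "6972", True))  # "6972" -> new ${APP_UID:-6972}
print(expand("APP_UID", "6972", False)) # ""     -> old ${APP_UID-6972}
```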

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:

View file

@ -14,7 +14,7 @@ WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=D LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
# set umask to ensure group read / write at runtime
WORKDIR /home/app/text-generation-webui

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:
@ -41,12 +41,4 @@ services:
security_opt:
- seccomp=unconfined
volumes:
- ./characters:/home/app/text-generation-webui/characters
- ./extensions:/home/app/text-generation-webui/extensions
- ./loras:/home/app/text-generation-webui/loras
- ./models:/home/app/text-generation-webui/models
- ./presets:/home/app/text-generation-webui/presets
- ./prompts:/home/app/text-generation-webui/prompts
- ./softprompts:/home/app/text-generation-webui/softprompts
- ./training:/home/app/text-generation-webui/training
- ./cloudflared:/etc/cloudflared
- ./user_data:/home/app/text-generation-webui/user_data

View file

@ -14,7 +14,7 @@ WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
WORKDIR /home/app/text-generation-webui
# set umask to ensure group read / write at runtime

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:
@ -31,17 +31,7 @@ services:
stdin_open: true
tty: true
volumes:
- ./cache:/home/app/text-generation-webui/cache
- ./characters:/home/app/text-generation-webui/characters
- ./extensions:/home/app/text-generation-webui/extensions
- ./loras:/home/app/text-generation-webui/loras
- ./logs:/home/app/text-generation-webui/logs
- ./models:/home/app/text-generation-webui/models
- ./presets:/home/app/text-generation-webui/presets
- ./prompts:/home/app/text-generation-webui/prompts
- ./softprompts:/home/app/text-generation-webui/softprompts
- ./training:/home/app/text-generation-webui/training
- ./cloudflared:/etc/cloudflared
- ./user_data:/home/app/text-generation-webui/user_data
deploy:
resources:
reservations:

View file

@ -257,6 +257,85 @@ headers = {
in any of the examples above.
#### Tool/Function Calling Example
You need to use a model with tools support. The prompt will be automatically formatted using the model's Jinja2 template.
Request:
```
curl http://127.0.0.1:5000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What time is it currently in New York City?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_time",
"description": "Get current time in a specific timezones",
"parameters": {
"type": "object",
"required": ["timezone"],
"properties": {
"timezone": {
"type": "string",
"description": "IANA timezone name (e.g., America/New_York, Europe/London). Use Europe/Berlin as local timezone if no timezone provided by the user."
}
}
}
}
}
]
}'
```
Sample response:
```
{
"id": "chatcmpl-1746532051477984256",
"object": "chat.completion",
"created": 1746532051,
"model": "qwen2.5-coder-14b-instruct-q4_k_m.gguf",
"choices": [
{
"index": 0,
"finish_reason": "tool_calls",
"message": {
"role": "assistant",
"content": "```xml\n<function>\n{\n \"name\": \"get_current_time\",\n \"arguments\": {\n \"timezone\": \"America/New_York\"\n }\n}\n</function>\n```"
},
"tool_calls": [
{
"type": "function",
"function": {
"name": "get_current_time",
"arguments": "{\"timezone\": \"America/New_York\"}"
},
"id": "call_52ij07mh",
"index": "0"
}
]
}
],
"usage": {
"prompt_tokens": 224,
"completion_tokens": 38,
"total_tokens": 262
}
}
```
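The same tool-calling request can be issued from Python; a minimal sketch assuming the `requests` library and the default local API address shown in the curl command above:

```
import json
import requests

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_time",
        "description": "Get current time in a specific timezone",
        "parameters": {
            "type": "object",
            "required": ["timezone"],
            "properties": {
                "timezone": {"type": "string", "description": "IANA timezone name"}
            },
        },
    },
}]

resp = requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "What time is it currently in New York City?"}],
        "tools": tools,
    },
    timeout=120,
)
choice = resp.json()["choices"][0]

# With a tools-capable model, finish_reason is "tool_calls" and the parsed calls
# are returned next to the message, with arguments as a JSON string.
if choice["finish_reason"] == "tool_calls":
    for call in choice["tool_calls"]:
        args = json.loads(call["function"]["arguments"])
        print(call["function"]["name"], args)
```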
### Environment variables
The following environment variables can be used (they take precedence over everything else):

View file

@ -1,16 +1,14 @@
import base64
import copy
import re
import json
import time
from collections import deque
from io import BytesIO
import requests
import tiktoken
from PIL import Image
from pydantic import ValidationError
from extensions.openai.errors import InvalidRequestError
from extensions.openai.utils import debug_msg
from extensions.openai.typing import ToolDefinition
from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall
from modules import shared
from modules.chat import (
generate_chat_prompt,
@ -96,72 +94,32 @@ def convert_history(history):
user_input_last = True
system_message = ""
# Multimodal: convert OpenAI format to multimodal extension format
if any('content' in entry and isinstance(entry['content'], list) for entry in history):
new_history = []
for entry in history:
if isinstance(entry['content'], list):
for item in entry['content']:
if not isinstance(item, dict):
continue
image_url = None
content = None
if item['type'] == 'image_url' and isinstance(item['image_url'], dict):
image_url = item['image_url']['url']
elif item['type'] == 'text' and isinstance(item['text'], str):
content = item['text']
if image_url:
new_history.append({"image_url": image_url, "role": "user"})
if content:
new_history.append({"content": content, "role": "user"})
else:
new_history.append(entry)
history = new_history
for entry in history:
if "image_url" in entry:
image_url = entry['image_url']
if "base64" in image_url:
image_url = re.sub('^data:image/.+;base64,', '', image_url)
img = Image.open(BytesIO(base64.b64decode(image_url)))
else:
try:
my_res = requests.get(image_url)
img = Image.open(BytesIO(my_res.content))
except Exception:
raise 'Image cannot be loaded from the URL!'
buffered = BytesIO()
if img.mode in ("RGBA", "P"):
img = img.convert("RGB")
img.save(buffered, format="JPEG")
img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
content = f'<img src="data:image/jpeg;base64,{img_str}">'
else:
content = entry["content"]
content = entry["content"]
role = entry["role"]
if role == "user":
user_input = content
user_input_last = True
if current_message:
chat_dialogue.append([current_message, ''])
chat_dialogue.append([current_message, '', ''])
current_message = ""
current_message = content
elif role == "assistant":
if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "":
continue # skip tool calls
current_reply = content
user_input_last = False
if current_message:
chat_dialogue.append([current_message, current_reply])
chat_dialogue.append([current_message, current_reply, ''])
current_message = ""
current_reply = ""
else:
chat_dialogue.append(['', current_reply])
chat_dialogue.append(['', current_reply, ''])
elif role == "tool":
user_input_last = False
chat_dialogue.append(['', '', content])
elif role == "system":
system_message += f"\n{content}" if system_message else content
@ -181,6 +139,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
if 'messages' not in body:
raise InvalidRequestError(message="messages is required", param='messages')
tools = None
if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0:
tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails
messages = body['messages']
for m in messages:
if 'role' not in m:
@ -238,6 +200,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
'custom_system_message': custom_system_message,
'chat_template_str': chat_template_str,
'chat-instruct_command': chat_instruct_command,
'tools': tools,
'history': history,
'stream': stream
})
@ -250,7 +213,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
requested_model = generate_params.pop('model')
logprob_proc = generate_params.pop('logprob_proc', None)
def chat_streaming_chunk(content):
def chat_streaming_chunk(content, chunk_tool_calls=None):
# begin streaming
chunk = {
"id": cmpl_id,
@ -260,7 +223,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
resp_list: [{
"index": 0,
"finish_reason": None,
"delta": {'role': 'assistant', 'content': content},
"delta": {'role': 'assistant', 'content': content, 'tool_calls': chunk_tool_calls},
}],
}
@ -269,6 +232,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
# else:
# chunk[resp_list][0]["logprobs"] = None
return chunk
# generate reply #######################################
@ -277,8 +241,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
yield {'prompt': prompt}
return
debug_msg({'prompt': prompt, 'generate_params': generate_params})
if stream:
yield chat_streaming_chunk('')
@ -288,8 +250,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
answer = ''
seen_content = ''
tool_calls = []
end_last_tool_call = 0
supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None
for a in generator:
answer = a['internal'][-1][1]
if supported_tools is not None:
tool_call = parseToolCall(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else []
if len(tool_call) > 0:
for tc in tool_call:
tc["id"] = getToolCallId()
tc["index"] = str(len(tool_calls))
tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"])
tool_calls.append(tc)
end_last_tool_call = len(answer)
if stream:
len_seen = len(seen_content)
new_content = answer[len_seen:]
@ -297,18 +274,25 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
continue
seen_content = answer
chunk = chat_streaming_chunk(new_content)
seen_content = answer
yield chunk
# stop generation if tool_calls were generated previously
if len(tool_calls) > 0:
break
token_count = len(encode(prompt)[0])
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if len(tool_calls) > 0:
stop_reason = "tool_calls"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']:
stop_reason = "length"
if stream:
chunk = chat_streaming_chunk('')
chunk = chat_streaming_chunk('', tool_calls)
chunk[resp_list][0]['finish_reason'] = stop_reason
chunk['usage'] = {
"prompt_tokens": token_count,
@ -326,7 +310,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
resp_list: [{
"index": 0,
"finish_reason": stop_reason,
"message": {"role": "assistant", "content": answer}
"message": {"role": "assistant", "content": answer},
"tool_calls": tool_calls
}],
"usage": {
"prompt_tokens": token_count,
@ -515,3 +500,19 @@ def completions(body: dict, is_legacy: bool = False) -> dict:
def stream_completions(body: dict, is_legacy: bool = False):
for resp in completions_common(body, is_legacy, stream=True):
yield resp
def validateTools(tools: list[dict]):
# Validate each tool definition in the JSON array
valid_tools = None
for idx in range(len(tools)):
tool = tools[idx]
try:
tool_definition = ToolDefinition(**tool)
if valid_tools is None:
valid_tools = []
valid_tools.append(tool)
except ValidationError:
raise InvalidRequestError(message=f"Invalid tool specification at index {idx}.", param='tools')
return valid_tools
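
Taken together, the changes in this file let a client pass OpenAI-style tool definitions and receive tool_calls back. A minimal client-side sketch of that round trip follows; the endpoint, port, model-free body, and the get_weather tool are illustrative assumptions, not part of this diff. Note that this diff returns "tool_calls" at the choice level, next to "message".

# Hypothetical client for the tool-calling path added above.
import json
import requests  # any HTTP client would do

body = {
    "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Look up the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
}

resp = requests.post("http://127.0.0.1:5000/v1/chat/completions", json=body).json()
choice = resp["choices"][0]
if choice["finish_reason"] == "tool_calls":
    for call in choice["tool_calls"]:
        print(call["function"]["name"], json.loads(call["function"]["arguments"]))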


@ -14,6 +14,7 @@ from fastapi.requests import Request
from fastapi.responses import JSONResponse
from pydub import AudioSegment
from sse_starlette import EventSourceResponse
from starlette.concurrency import iterate_in_threadpool
import extensions.openai.completions as OAIcompletions
import extensions.openai.images as OAIimages
@ -114,18 +115,28 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
if request_data.stream:
async def generator():
async with streaming_semaphore:
response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy)
for resp in response:
disconnected = await request.is_disconnected()
if disconnected:
break
try:
response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy)
async for resp in iterate_in_threadpool(response):
disconnected = await request.is_disconnected()
if disconnected:
break
yield {"data": json.dumps(resp)}
yield {"data": json.dumps(resp)}
finally:
stop_everything_event()
response.close()
return
return EventSourceResponse(generator()) # SSE streaming
else:
response = OAIcompletions.completions(to_dict(request_data), is_legacy=is_legacy)
response = await asyncio.to_thread(
OAIcompletions.completions,
to_dict(request_data),
is_legacy=is_legacy
)
return JSONResponse(response)
@ -137,18 +148,28 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
if request_data.stream:
async def generator():
async with streaming_semaphore:
response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy)
for resp in response:
disconnected = await request.is_disconnected()
if disconnected:
break
try:
response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy)
async for resp in iterate_in_threadpool(response):
disconnected = await request.is_disconnected()
if disconnected:
break
yield {"data": json.dumps(resp)}
yield {"data": json.dumps(resp)}
finally:
stop_everything_event()
response.close()
return
return EventSourceResponse(generator()) # SSE streaming
else:
response = OAIcompletions.chat_completions(to_dict(request_data), is_legacy=is_legacy)
response = await asyncio.to_thread(
OAIcompletions.chat_completions,
to_dict(request_data),
is_legacy=is_legacy
)
return JSONResponse(response)
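
The change above moves the blocking completion generators off the event loop: they are drained through iterate_in_threadpool, and the try/finally guarantees the generator is closed (and generation stopped) when the client disconnects. A self-contained sketch of that pattern, with placeholder names:

# Minimal sketch: consuming a blocking (synchronous) generator from async code
# without stalling the event loop. Names here are illustrative, not repository code.
import asyncio
import time

from starlette.concurrency import iterate_in_threadpool


def slow_sync_generator():
    for i in range(3):
        time.sleep(0.5)   # stands in for blocking token generation
        yield i


async def consume():
    gen = slow_sync_generator()
    try:
        async for item in iterate_in_threadpool(gen):  # each next() runs in a worker thread
            print("got", item)
    finally:
        gen.close()  # mirrors the finally: response.close() above


asyncio.run(consume())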
@ -436,7 +457,7 @@ def run_server():
# Start server
logging.getLogger("uvicorn.error").propagate = False
uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)
def setup():


@ -1,8 +1,8 @@
import json
import time
from typing import Dict, List
from typing import Dict, List, Optional
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, validator
class GenerationOptions(BaseModel):
@ -54,6 +54,48 @@ class GenerationOptions(BaseModel):
grammar_string: str = ""
class ToolDefinition(BaseModel):
function: 'ToolFunction'
type: str
class ToolFunction(BaseModel):
description: str
name: str
parameters: 'ToolParameters'
class ToolParameters(BaseModel):
properties: Optional[Dict[str, 'ToolProperty']] = None
required: Optional[list[str]] = None
type: str
description: Optional[str] = None
class ToolProperty(BaseModel):
description: Optional[str] = None
type: Optional[str] = None  # may be absent when a property uses constructs like anyOf, seen e.g. in a git_create_branch tool: 'base_branch': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'Base Branch'}
class FunctionCall(BaseModel):
name: str
arguments: Optional[str] = None
parameters: Optional[str] = None
@validator('arguments', allow_reuse=True)
def checkPropertyArgsOrParams(cls, v, values, **kwargs):
if not v and not values.get('parameters'):
raise ValueError("At least one of 'arguments' or 'parameters' must be provided as property in FunctionCall type")
return v
class ToolCall(BaseModel):
id: str
index: int
type: str
function: FunctionCall
class CompletionRequestParams(BaseModel):
model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.")
prompt: str | List[str]
@ -92,6 +134,7 @@ class ChatCompletionRequestParams(BaseModel):
frequency_penalty: float | None = 0
function_call: str | dict | None = Field(default=None, description="Unused parameter.")
functions: List[dict] | None = Field(default=None, description="Unused parameter.")
tools: List[dict] | None = Field(default=None, description="Tools signatures passed via MCP.")
logit_bias: dict | None = None
max_tokens: int | None = None
n: int | None = Field(default=1, description="Unused parameter.")
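
As a quick check of what these models accept, the entry below (adapted from the git_create_branch example mentioned in the ToolProperty comment) validates cleanly; ToolDefinition(**tool) is the same call validateTools() makes per entry. The import path is an assumption based on this repository's layout:

# Hedged usage sketch; assumes the models above live in extensions/openai/typing.py.
from pydantic import ValidationError

from extensions.openai.typing import ToolDefinition

tool = {
    "type": "function",
    "function": {
        "name": "git_create_branch",
        "description": "Creates a new branch from an optional base branch",
        "parameters": {
            "type": "object",
            "properties": {
                "repo_path": {"type": "string"},
                "branch_name": {"type": "string"},
            },
            "required": ["repo_path", "branch_name"],
        },
    },
}

try:
    ToolDefinition(**tool)  # raises ValidationError on a malformed definition
    print("tool definition accepted")
except ValidationError as e:
    print("rejected:", e)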


@ -1,5 +1,8 @@
import base64
import json
import os
import random
import re
import time
import traceback
from typing import Callable, Optional
@ -52,3 +55,94 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star
time.sleep(3)
raise Exception('Could not start cloudflared.')
def getToolCallId() -> str:
letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789"
b = [random.choice(letter_bytes) for _ in range(8)]
return "call_" + "".join(b).lower()
def checkAndSanitizeToolCallCandidate(candidate_dict: dict, tool_names: list[str]):
# check if property 'function' exists and is a dictionary, otherwise adapt dict
if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str):
candidate_dict = {"type": "function", "function": candidate_dict}
if 'function' in candidate_dict and isinstance(candidate_dict['function'], str):
candidate_dict['name'] = candidate_dict['function']
del candidate_dict['function']
candidate_dict = {"type": "function", "function": candidate_dict}
if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict):
# check if 'name' exists within 'function' and is part of known tools
if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names:
candidate_dict["type"] = "function" # ensure required property 'type' exists and has the right value
# map property 'parameters' used by some older models to 'arguments'
if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]:
candidate_dict["function"]["arguments"] = candidate_dict["function"]["parameters"]
del candidate_dict["function"]["parameters"]
return candidate_dict
return None
def parseToolCall(answer: str, tool_names: list[str]):
matches = []
# abort on very short answers to save computation cycles
if len(answer) < 10:
return matches
# Regex patterns that find JSON content wrapped in code fences or in tags such as <function>, <tools>, <tool_call> observed from various models
patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)</\1>"]
for pattern in patterns:
for match in re.finditer(pattern, answer, re.DOTALL):
# print(match.group(2))
if match.group(2) is None:
continue
# remove backtick wraps if present
candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip())
candidate = re.sub(r"```$", "", candidate.strip())
# unwrap inner tags
candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL)
# The LLM might have generated multiple JSON objects separated by line breaks; normalize them into an array so each object can be parsed individually
if re.search(r"\}\s*\n\s*\{", candidate) is not None:
candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
if not candidate.strip().startswith("["):
candidate = "[" + candidate + "]"
candidates = []
try:
# parse the candidate JSON into a dictionary
candidates = json.loads(candidate)
if not isinstance(candidates, list):
candidates = [candidates]
except json.JSONDecodeError:
# Ignore invalid JSON silently
continue
for candidate_dict in candidates:
checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
if checked_candidate is not None:
matches.append(checked_candidate)
# last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags
if len(matches) == 0:
try:
candidate = answer
# The LLM might have generated multiple JSON objects separated by line breaks; normalize them into an array so each object can be parsed individually
if re.search(r"\}\s*\n\s*\{", candidate) is not None:
candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
if not candidate.strip().startswith("["):
candidate = "[" + candidate + "]"
# parse the candidate JSON into a dictionary
candidates = json.loads(candidate)
if not isinstance(candidates, list):
candidates = [candidates]
for candidate_dict in candidates:
checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
if checked_candidate is not None:
matches.append(checked_candidate)
except json.JSONDecodeError:
# Ignore invalid JSON silently
pass
return matches
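
Concretely, with the helpers above in scope, a wrapped tool call in a model answer is parsed as follows; the tool name and the generated id are illustrative:

# Illustrative input/output for parseToolCall(); the tool name is hypothetical.
answer = """<tool_call>
{"name": "get_weather", "arguments": {"city": "Paris"}}
</tool_call>"""

calls = parseToolCall(answer, ["get_weather"])
# -> [{"type": "function",
#      "function": {"name": "get_weather", "arguments": {"city": "Paris"}}}]
# completions.py then assigns an id via getToolCallId() (e.g. "call_x7k2m9qa")
# and JSON-encodes the arguments before returning the call to the client.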


@ -1,10 +1,11 @@
import math
import random
import threading
import torch
import chromadb
import numpy as np
import posthog
import torch
from chromadb.config import Settings
from chromadb.utils import embedding_functions
@ -292,6 +293,8 @@ class ChromaCollector():
for doc in documents:
doc_tokens = encode(doc)[0]
if isinstance(doc_tokens, np.ndarray):
doc_tokens = doc_tokens.tolist()
doc_token_count = len(doc_tokens)
if current_token_count + doc_token_count > max_token_count:
# If adding this document would exceed the max token count,


@ -1,3 +1,7 @@
// -------------------------------------------------
// Event handlers
// -------------------------------------------------
function copyToClipboard(element) {
if (!element) return;
@ -18,6 +22,201 @@ function copyToClipboard(element) {
});
}
function branchHere(element) {
if (!element) return;
const messageElement = element.closest(".message, .user-message, .assistant-message");
if (!messageElement) return;
const index = messageElement.getAttribute("data-index");
if (!index) return;
const branchIndexInput = document.getElementById("Branch-index").querySelector("input");
if (!branchIndexInput) {
console.error("Element with ID 'Branch-index' not found.");
return;
}
const branchButton = document.getElementById("Branch");
if (!branchButton) {
console.error("Required element 'Branch' not found.");
return;
}
branchIndexInput.value = index;
// Trigger any 'change' or 'input' events Gradio might be listening for
const event = new Event("input", { bubbles: true });
branchIndexInput.dispatchEvent(event);
branchButton.click();
}
// -------------------------------------------------
// Message Editing Functions
// -------------------------------------------------
function editHere(buttonElement) {
if (!buttonElement) return;
const messageElement = buttonElement.closest(".message, .user-message, .assistant-message");
if (!messageElement) return;
const messageBody = messageElement.querySelector(".message-body");
if (!messageBody) return;
// If already editing, focus the textarea
const existingTextarea = messageBody.querySelector(".editing-textarea");
if (existingTextarea) {
existingTextarea.focus();
return;
}
// Determine role based on message element - handle different chat modes
const isUserMessage = messageElement.classList.contains("user-message") ||
messageElement.querySelector(".text-you") !== null ||
messageElement.querySelector(".circle-you") !== null;
startEditing(messageElement, messageBody, isUserMessage);
}
function startEditing(messageElement, messageBody, isUserMessage) {
const rawText = messageElement.getAttribute("data-raw") || messageBody.textContent;
const originalHTML = messageBody.innerHTML;
// Create editing interface
const editingInterface = createEditingInterface(rawText);
// Replace message content
messageBody.innerHTML = "";
messageBody.appendChild(editingInterface.textarea);
messageBody.appendChild(editingInterface.controls);
editingInterface.textarea.focus();
editingInterface.textarea.setSelectionRange(rawText.length, rawText.length);
// Setup event handlers
setupEditingHandlers(editingInterface.textarea, messageElement, originalHTML, messageBody, isUserMessage);
}
function createEditingInterface(text) {
const textarea = document.createElement("textarea");
textarea.value = text;
textarea.className = "editing-textarea";
textarea.rows = Math.max(3, text.split("\n").length);
const controls = document.createElement("div");
controls.className = "edit-controls-container";
const saveButton = document.createElement("button");
saveButton.textContent = "Save";
saveButton.className = "edit-control-button";
saveButton.type = "button";
const cancelButton = document.createElement("button");
cancelButton.textContent = "Cancel";
cancelButton.className = "edit-control-button edit-cancel-button";
cancelButton.type = "button";
controls.appendChild(saveButton);
controls.appendChild(cancelButton);
return { textarea, controls, saveButton, cancelButton };
}
function setupEditingHandlers(textarea, messageElement, originalHTML, messageBody, isUserMessage) {
const saveButton = messageBody.querySelector(".edit-control-button:not(.edit-cancel-button)");
const cancelButton = messageBody.querySelector(".edit-cancel-button");
const submitEdit = () => {
const index = messageElement.getAttribute("data-index");
if (!index || !submitMessageEdit(index, textarea.value, isUserMessage)) {
cancelEdit();
}
};
const cancelEdit = () => {
messageBody.innerHTML = originalHTML;
};
// Event handlers
saveButton.onclick = submitEdit;
cancelButton.onclick = cancelEdit;
textarea.onkeydown = (e) => {
if (e.key === "Enter" && !e.shiftKey) {
e.preventDefault();
submitEdit();
} else if (e.key === "Escape") {
e.preventDefault();
cancelEdit();
}
};
}
function submitMessageEdit(index, newText, isUserMessage) {
const editIndexInput = document.getElementById("Edit-message-index")?.querySelector("input");
const editTextInput = document.getElementById("Edit-message-text")?.querySelector("textarea");
const editRoleInput = document.getElementById("Edit-message-role")?.querySelector("textarea");
const editButton = document.getElementById("Edit-message");
if (!editIndexInput || !editTextInput || !editRoleInput || !editButton) {
console.error("Edit elements not found");
return false;
}
editIndexInput.value = index;
editTextInput.value = newText;
editRoleInput.value = isUserMessage ? "user" : "assistant";
editIndexInput.dispatchEvent(new Event("input", { bubbles: true }));
editTextInput.dispatchEvent(new Event("input", { bubbles: true }));
editRoleInput.dispatchEvent(new Event("input", { bubbles: true }));
editButton.click();
return true;
}
function navigateVersion(element, direction) {
if (!element) return;
const messageElement = element.closest(".message, .user-message, .assistant-message");
if (!messageElement) return;
const index = messageElement.getAttribute("data-index");
if (!index) return;
// Determine role based on message element classes
let role = "assistant"; // Default role
if (messageElement.classList.contains("user-message") ||
messageElement.querySelector(".text-you") ||
messageElement.querySelector(".circle-you")) {
role = "user";
}
const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input");
const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea");
const roleInput = document.getElementById("Navigate-message-role")?.querySelector("textarea");
const navigateButton = document.getElementById("Navigate-version");
if (!indexInput || !directionInput || !roleInput || !navigateButton) {
console.error("Navigation control elements (index, direction, role, or button) not found.");
return;
}
indexInput.value = index;
directionInput.value = direction;
roleInput.value = role;
// Trigger 'input' events for Gradio to pick up changes
const event = new Event("input", { bubbles: true });
indexInput.dispatchEvent(event);
directionInput.dispatchEvent(event);
roleInput.dispatchEvent(event);
navigateButton.click();
}
function regenerateClick() {
document.getElementById("Regenerate").click();
}


@ -1,3 +1,7 @@
// ------------------------------------------------
// Main
// ------------------------------------------------
let main_parent = document.getElementById("chat-tab").parentNode;
let extensions = document.getElementById("extensions");
@ -39,9 +43,24 @@ document.querySelector(".header_bar").addEventListener("click", function(event)
//------------------------------------------------
// Keyboard shortcuts
//------------------------------------------------
// --- Helper functions --- //
function isModifiedKeyboardEvent() {
return (event instanceof KeyboardEvent &&
(event.shiftKey || event.ctrlKey || event.altKey || event.metaKey));
}
function isFocusedOnEditableTextbox() {
if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") {
return !!event.target.value;
}
return false;
}
let previousTabId = "chat-tab-button";
document.addEventListener("keydown", function(event) {
// Stop generation on Esc pressed
if (event.key === "Escape") {
// Find the element with id 'stop' and click it
@ -49,10 +68,15 @@ document.addEventListener("keydown", function(event) {
if (stopButton) {
stopButton.click();
}
return;
}
if (!document.querySelector("#chat-tab").checkVisibility() ) {
return;
}
// Show chat controls on Ctrl + S
else if (event.ctrlKey && event.key == "s") {
if (event.ctrlKey && event.key == "s") {
event.preventDefault();
var showControlsElement = document.getElementById("show-controls");
@ -82,24 +106,29 @@ document.addEventListener("keydown", function(event) {
document.getElementById("Remove-last").click();
}
// Copy last on Ctrl + Shift + K
else if (event.ctrlKey && event.shiftKey && event.key === "K") {
event.preventDefault();
document.getElementById("Copy-last").click();
}
// Replace last on Ctrl + Shift + L
else if (event.ctrlKey && event.shiftKey && event.key === "L") {
event.preventDefault();
document.getElementById("Replace-last").click();
}
// Impersonate on Ctrl + Shift + M
else if (event.ctrlKey && event.shiftKey && event.key === "M") {
event.preventDefault();
document.getElementById("Impersonate").click();
}
// --- Simple version navigation --- //
if (!isFocusedOnEditableTextbox()) {
// Version navigation on Arrow keys (horizontal)
if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") {
event.preventDefault();
navigateLastAssistantMessage("left");
}
else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") {
event.preventDefault();
if (!navigateLastAssistantMessage("right")) {
// If can't navigate right (last version), regenerate
document.getElementById("Regenerate").click();
}
}
}
});
//------------------------------------------------
@ -132,8 +161,6 @@ targetElement.addEventListener("scroll", function() {
// Create a MutationObserver instance
const observer = new MutationObserver(function(mutations) {
updateCssProperties();
if (targetElement.classList.contains("_generating")) {
typing.parentNode.classList.add("visible-dots");
document.getElementById("stop").style.display = "flex";
@ -144,12 +171,24 @@ const observer = new MutationObserver(function(mutations) {
document.getElementById("Generate").style.display = "flex";
}
doSyntaxHighlighting();
if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) {
targetElement.scrollTop = targetElement.scrollHeight;
}
const chatElement = document.getElementById("chat");
if (chatElement && chatElement.getAttribute("data-mode") === "instruct") {
const messagesContainer = chatElement.querySelector(".messages");
const lastChild = messagesContainer?.lastElementChild;
const prevSibling = lastChild?.previousElementSibling;
if (lastChild && prevSibling) {
lastChild.style.setProperty("margin-bottom",
`max(0px, calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px))`,
"important"
);
}
}
});
// Configure the observer to watch for changes in the subtree and attributes
@ -436,38 +475,6 @@ const chatInput = document.querySelector("#chat-input textarea");
// Variables to store current dimensions
let currentChatInputHeight = chatInput.clientHeight;
// Update chat layout based on chat and input dimensions
function updateCssProperties() {
const chatInputHeight = chatInput.clientHeight;
// Check if the chat container is visible
if (chatContainer.clientHeight > 0) {
const chatContainerParentHeight = chatContainer.parentNode.clientHeight;
const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`;
document.documentElement.style.setProperty("--chat-height", newChatHeight);
document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`);
// Adjust scrollTop based on input height change
if (chatInputHeight !== currentChatInputHeight) {
const deltaHeight = chatInputHeight - currentChatInputHeight;
if (!isScrolled && deltaHeight < 0) {
chatContainer.scrollTop = chatContainer.scrollHeight;
} else {
chatContainer.scrollTop += deltaHeight;
}
currentChatInputHeight = chatInputHeight;
}
}
}
// Observe textarea size changes and call update function
new ResizeObserver(updateCssProperties).observe(document.querySelector("#chat-input textarea"));
// Handle changes in window size
window.addEventListener("resize", updateCssProperties);
//------------------------------------------------
// Focus on the rename text area when it becomes visible
//------------------------------------------------
@ -720,7 +727,7 @@ function isMobile() {
// Function to initialize sidebars
function initializeSidebars() {
const isOnMobile = isMobile();
if (isOnMobile) {
// Mobile state: Hide sidebars and set closed states
[pastChatsRow, chatControlsRow, headerBar].forEach(el => {
@ -813,3 +820,55 @@ function createMobileTopBar() {
}
createMobileTopBar();
//------------------------------------------------
// Simple Navigation Functions
//------------------------------------------------
function navigateLastAssistantMessage(direction) {
const chat = document.querySelector("#chat");
if (!chat) return false;
const messages = chat.querySelectorAll("[data-index]");
if (messages.length === 0) return false;
// Find the last assistant message (starting from the end)
let lastAssistantMessage = null;
for (let i = messages.length - 1; i >= 0; i--) {
const msg = messages[i];
if (
msg.classList.contains("assistant-message") ||
msg.querySelector(".circle-bot") ||
msg.querySelector(".text-bot")
) {
lastAssistantMessage = msg;
break;
}
}
if (!lastAssistantMessage) return false;
const buttons = lastAssistantMessage.querySelectorAll(".version-nav-button");
for (let i = 0; i < buttons.length; i++) {
const button = buttons[i];
const onclick = button.getAttribute("onclick");
const disabled = button.hasAttribute("disabled");
const isLeft = onclick && onclick.includes("'left'");
const isRight = onclick && onclick.includes("'right'");
if (!disabled) {
if (direction === "left" && isLeft) {
navigateVersion(button, direction);
return true;
}
if (direction === "right" && isRight) {
navigateVersion(button, direction);
return true;
}
}
}
return false;
}


@ -5,6 +5,7 @@ import html
import json
import pprint
import re
import time
from datetime import datetime
from functools import partial
from pathlib import Path
@ -30,12 +31,37 @@ from modules.text_generation import (
get_max_prompt_length
)
from modules.utils import delete_file, get_available_characters, save_file
from modules.web_search import add_web_search_attachments
def strftime_now(format):
return datetime.now().strftime(format)
def get_current_timestamp():
"""Returns the current time in 24-hour format"""
return datetime.now().strftime('%b %d, %Y %H:%M')
def update_message_metadata(metadata_dict, role, index, **fields):
"""
Updates or adds metadata fields for a specific message.
Args:
metadata_dict: The metadata dictionary
role: The role (user, assistant, etc)
index: The message index
**fields: Arbitrary metadata fields to update/add
"""
key = f"{role}_{index}"
if key not in metadata_dict:
metadata_dict[key] = {}
# Update with provided fields
for field_name, field_value in fields.items():
metadata_dict[key][field_name] = field_value
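
A small usage sketch for the helper above; the timestamp value is illustrative:

# Repeated calls for the same role/index merge fields into one metadata entry.
metadata = {}
update_message_metadata(metadata, "user", 0, timestamp=get_current_timestamp())
update_message_metadata(metadata, "user", 0, attachments=[])
# metadata == {"user_0": {"timestamp": "May 29, 2025 14:08", "attachments": []}}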
jinja_env = ImmutableSandboxedEnvironment(
trim_blocks=True,
lstrip_blocks=True,
@ -132,7 +158,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
impersonate = kwargs.get('impersonate', False)
_continue = kwargs.get('_continue', False)
also_return_rows = kwargs.get('also_return_rows', False)
history = kwargs.get('history', state['history'])['internal']
history_data = kwargs.get('history', state['history'])
history = history_data['internal']
metadata = history_data.get('metadata', {})
# Templates
chat_template_str = state['chat_template_str']
@ -145,7 +173,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
instruct_renderer = partial(
instruction_template.render,
builtin_tools=None,
tools=None,
tools=state['tools'] if 'tools' in state else None,
tools_in_user_message=False,
add_generation_prompt=False
)
@ -171,18 +199,62 @@ def generate_chat_prompt(user_input, state, **kwargs):
messages.append({"role": "system", "content": context})
insert_pos = len(messages)
for user_msg, assistant_msg in reversed(history):
user_msg = user_msg.strip()
assistant_msg = assistant_msg.strip()
for i, entry in enumerate(reversed(history)):
user_msg = entry[0].strip()
assistant_msg = entry[1].strip()
tool_msg = entry[2].strip() if len(entry) > 2 else ''
row_idx = len(history) - i - 1
if tool_msg:
messages.insert(insert_pos, {"role": "tool", "content": tool_msg})
if assistant_msg:
messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']:
messages.insert(insert_pos, {"role": "user", "content": user_msg})
# Check for user message attachments in metadata
user_key = f"user_{row_idx}"
enhanced_user_msg = user_msg
# Add attachment content if present
if user_key in metadata and "attachments" in metadata[user_key]:
attachments_text = ""
for attachment in metadata[user_key]["attachments"]:
filename = attachment.get("name", "file")
content = attachment.get("content", "")
attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
if attachments_text:
enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}"
messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg})
user_input = user_input.strip()
if user_input and not impersonate and not _continue:
# Check if we have attachments even with empty input
has_attachments = False
if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
current_row_idx = len(history)
user_key = f"user_{current_row_idx}"
has_attachments = user_key in metadata and "attachments" in metadata[user_key]
if (user_input or has_attachments) and not impersonate and not _continue:
# For the current user input being processed, check if we need to add attachments
if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0:
current_row_idx = len(history)
user_key = f"user_{current_row_idx}"
if user_key in metadata and "attachments" in metadata[user_key]:
attachments_text = ""
for attachment in metadata[user_key]["attachments"]:
filename = attachment.get("name", "file")
content = attachment.get("content", "")
attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n"
if attachments_text:
user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}"
messages.append({"role": "user", "content": user_input})
def make_prompt(messages):
@ -251,7 +323,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
# Resort to truncating the user input
else:
user_message = messages[-1]['content']
# Bisect the truncation point
@ -288,6 +359,50 @@ def generate_chat_prompt(user_input, state, **kwargs):
return prompt
def count_prompt_tokens(text_input, state):
"""Count tokens for current history + input including attachments"""
if shared.tokenizer is None:
return "Tokenizer not available"
try:
# Handle dict format with text and files
files = []
if isinstance(text_input, dict):
files = text_input.get('files', [])
text = text_input.get('text', '')
else:
text = text_input
files = []
# Create temporary history copy to add attachments
temp_history = copy.deepcopy(state['history'])
if 'metadata' not in temp_history:
temp_history['metadata'] = {}
# Process attachments if any
if files:
row_idx = len(temp_history['internal'])
for file_path in files:
add_message_attachment(temp_history, row_idx, file_path, is_user=True)
# Create temp state with modified history
temp_state = copy.deepcopy(state)
temp_state['history'] = temp_history
# Build prompt using existing logic
prompt = generate_chat_prompt(text, temp_state)
current_tokens = get_encoded_length(prompt)
max_tokens = temp_state['truncation_length']
percentage = (current_tokens / max_tokens) * 100 if max_tokens > 0 else 0
return f"History + Input:<br/>{current_tokens:,} / {max_tokens:,} tokens ({percentage:.1f}%)"
except Exception as e:
logger.error(f"Error counting tokens: {e}")
return f"Error: {str(e)}"
def get_stopping_strings(state):
stopping_strings = []
renderers = []
@ -336,58 +451,212 @@ def get_stopping_strings(state):
return result
def add_message_version(history, role, row_idx, is_current=True):
key = f"{role}_{row_idx}"
if 'metadata' not in history:
history['metadata'] = {}
if key not in history['metadata']:
history['metadata'][key] = {}
if "versions" not in history['metadata'][key]:
history['metadata'][key]["versions"] = []
# Determine which index to use for content based on role
content_idx = 0 if role == 'user' else 1
current_content = history['internal'][row_idx][content_idx]
current_visible = history['visible'][row_idx][content_idx]
history['metadata'][key]["versions"].append({
"content": current_content,
"visible_content": current_visible,
"timestamp": get_current_timestamp()
})
if is_current:
# Set the current_version_index to the newly added version (which is now the last one).
history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1
def add_message_attachment(history, row_idx, file_path, is_user=True):
"""Add a file attachment to a message in history metadata"""
if 'metadata' not in history:
history['metadata'] = {}
key = f"{'user' if is_user else 'assistant'}_{row_idx}"
if key not in history['metadata']:
history['metadata'][key] = {"timestamp": get_current_timestamp()}
if "attachments" not in history['metadata'][key]:
history['metadata'][key]["attachments"] = []
# Get file info using pathlib
path = Path(file_path)
filename = path.name
file_extension = path.suffix.lower()
try:
# Handle different file types
if file_extension == '.pdf':
# Process PDF file
content = extract_pdf_text(path)
file_type = "application/pdf"
else:
# Default handling for text files
with open(path, 'r', encoding='utf-8') as f:
content = f.read()
file_type = "text/plain"
# Add attachment
attachment = {
"name": filename,
"type": file_type,
"content": content,
}
history['metadata'][key]["attachments"].append(attachment)
return content # Return the content for reuse
except Exception as e:
logger.error(f"Error processing attachment {filename}: {e}")
return None
def extract_pdf_text(pdf_path):
"""Extract text from a PDF file"""
import PyPDF2
text = ""
try:
with open(pdf_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
text += page.extract_text() + "\n\n"
return text.strip()
except Exception as e:
logger.error(f"Error extracting text from PDF: {e}")
return f"[Error extracting PDF text: {str(e)}]"
def generate_search_query(user_message, state):
"""Generate a search query from user message using the LLM"""
# Augment the user message with search instruction
augmented_message = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else."
# Use a minimal state for search query generation but keep the full history
search_state = state.copy()
search_state['max_new_tokens'] = 64
search_state['auto_max_new_tokens'] = False
search_state['enable_thinking'] = False
# Generate the full prompt using existing history + augmented message
formatted_prompt = generate_chat_prompt(augmented_message, search_state)
query = ""
for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True):
query = reply.strip()
return query
def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False):
# Handle dict format with text and files
files = []
if isinstance(text, dict):
files = text.get('files', [])
text = text.get('text', '')
history = state['history']
output = copy.deepcopy(history)
output = apply_extensions('history', output)
state = apply_extensions('state', state)
# Initialize metadata if not present
if 'metadata' not in output:
output['metadata'] = {}
visible_text = None
stopping_strings = get_stopping_strings(state)
is_stream = state['stream']
# Prepare the input
if not (regenerate or _continue):
visible_text = html.escape(text)
# Process file attachments and store in metadata
row_idx = len(output['internal'])
# Add attachments to metadata only, not modifying the message text
for file_path in files:
add_message_attachment(output, row_idx, file_path, is_user=True)
# Add web search results as attachments if enabled
if state.get('enable_web_search', False):
search_query = generate_search_query(text, state)
add_web_search_attachments(output, row_idx, text, search_query, state)
# Apply extensions
text, visible_text = apply_extensions('chat_input', text, visible_text, state)
text = apply_extensions('input', text, state, is_chat=True)
output['internal'].append([text, ''])
output['visible'].append([visible_text, ''])
# Add metadata with timestamp
update_message_metadata(output['metadata'], "user", row_idx, timestamp=get_current_timestamp())
# *Is typing...*
if loading_message:
yield {
'visible': output['visible'][:-1] + [[output['visible'][-1][0], shared.processing_message]],
'internal': output['internal']
'internal': output['internal'],
'metadata': output['metadata']
}
else:
text, visible_text = output['internal'][-1][0], output['visible'][-1][0]
if regenerate:
row_idx = len(output['internal']) - 1
# Store the old response as a version before regenerating
if not output['metadata'].get(f"assistant_{row_idx}", {}).get('versions'):
add_message_version(output, "assistant", row_idx, is_current=False)
# Add new empty version (will be filled during streaming)
key = f"assistant_{row_idx}"
output['metadata'][key]["versions"].append({
"content": "",
"visible_content": "",
"timestamp": get_current_timestamp()
})
output['metadata'][key]["current_version_index"] = len(output['metadata'][key]["versions"]) - 1
if loading_message:
yield {
'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]],
'internal': output['internal'][:-1] + [[text, '']]
'internal': output['internal'][:-1] + [[text, '']],
'metadata': output['metadata']
}
elif _continue:
last_reply = [output['internal'][-1][1], output['visible'][-1][1]]
if loading_message:
yield {
'visible': output['visible'][:-1] + [[visible_text, last_reply[1] + '...']],
'internal': output['internal']
'internal': output['internal'],
'metadata': output['metadata']
}
# Generate the prompt
kwargs = {
'_continue': _continue,
'history': output if _continue else {k: v[:-1] for k, v in output.items()}
'history': output if _continue else {
k: (v[:-1] if k in ['internal', 'visible'] else v)
for k, v in output.items()
}
}
prompt = apply_extensions('custom_generate_chat_prompt', text, state, **kwargs)
if prompt is None:
prompt = generate_chat_prompt(text, state, **kwargs)
# Add timestamp for assistant's response at the start of generation
row_idx = len(output['internal']) - 1
update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp())
# Generate
reply = None
for j, reply in enumerate(generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True, for_ui=for_ui)):
@ -402,16 +671,11 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
# Extract the reply
if state['mode'] in ['chat', 'chat-instruct']:
visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply + '')
visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply)
else:
visible_reply = reply + '▍'
visible_reply = reply
visible_reply = html.escape(visible_reply)
if shared.stop_everything:
if output['visible'][-1][1].endswith('▍'):
output['visible'][-1][1] = output['visible'][-1][1][:-1]
output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
yield output
return
@ -420,21 +684,21 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
# Separate already existing content from new content
original_internal = output['internal'][-1][1]
original_visible = output['visible'][-1][1]
# Get only the new generated part
new_content = reply[len(original_internal):] if reply.startswith(original_internal) else reply
new_content = new_content.lstrip()
# Translate only the new part
translated_new = apply_extensions('output', new_content, state, is_chat=True)
# Update both internal and visible versions
updated_internal = original_internal + " " + new_content
updated_visible = original_visible + " " + translated_new
output['internal'][-1] = [text, updated_internal]
output['visible'][-1] = [visible_text, updated_visible]
if is_stream:
yield output
elif not (j == 0 and visible_reply.strip() == ''):
@ -442,25 +706,45 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
translated_reply = apply_extensions('output', visible_reply.lstrip(' '), state, is_chat=True)
output['internal'][-1] = [text, reply.lstrip(' ')]
output['visible'][-1] = [visible_text, translated_reply]
# Keep version metadata in sync during streaming (for regeneration)
if regenerate:
row_idx = len(output['internal']) - 1
key = f"assistant_{row_idx}"
current_idx = output['metadata'][key]['current_version_index']
output['metadata'][key]['versions'][current_idx].update({
'content': output['internal'][row_idx][1],
'visible_content': output['visible'][row_idx][1]
})
if is_stream:
yield output
if output['visible'][-1][1].endswith('▍'):
output['visible'][-1][1] = output['visible'][-1][1][:-1]
# Final sync for version metadata (in case streaming was disabled)
if regenerate:
row_idx = len(output['internal']) - 1
key = f"assistant_{row_idx}"
current_idx = output['metadata'][key]['current_version_index']
output['metadata'][key]['versions'][current_idx].update({
'content': output['internal'][row_idx][1],
'visible_content': output['visible'][row_idx][1]
})
yield output
def impersonate_wrapper(text, state):
def impersonate_wrapper(textbox, state):
text = textbox['text']
static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
prompt = generate_chat_prompt('', state, impersonate=True)
stopping_strings = get_stopping_strings(state)
yield text + '...', static_output
textbox['text'] = text + '...'
yield textbox, static_output
reply = None
for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True):
yield (text + reply).lstrip(' '), static_output
textbox['text'] = (text + reply).lstrip(' ')
yield textbox, static_output
if shared.stop_everything:
return
@ -506,56 +790,81 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
send_dummy_reply(state['start_with'], state)
history = state['history']
last_save_time = time.monotonic()
save_interval = 8
for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)):
yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
current_time = time.monotonic()
# Save on first iteration or if save_interval seconds have passed
if i == 0 or (current_time - last_save_time) >= save_interval:
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
last_save_time = current_time
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
def remove_last_message(history):
if 'metadata' not in history:
history['metadata'] = {}
if len(history['visible']) > 0 and history['internal'][-1][0] != '<|BEGIN-VISIBLE-CHAT|>':
row_idx = len(history['internal']) - 1
last = history['visible'].pop()
history['internal'].pop()
# Remove metadata directly by known keys
if f"user_{row_idx}" in history['metadata']:
del history['metadata'][f"user_{row_idx}"]
if f"assistant_{row_idx}" in history['metadata']:
del history['metadata'][f"assistant_{row_idx}"]
else:
last = ['', '']
return html.unescape(last[0]), history
def send_last_reply_to_input(history):
if len(history['visible']) > 0:
return html.unescape(history['visible'][-1][1])
else:
return ''
def replace_last_reply(text, state):
history = state['history']
if len(text.strip()) == 0:
return history
elif len(history['visible']) > 0:
history['visible'][-1][1] = html.escape(text)
history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True)
return history
def send_dummy_message(text, state):
history = state['history']
# Handle both dict and string inputs
if isinstance(text, dict):
text = text['text']
# Initialize metadata if not present
if 'metadata' not in history:
history['metadata'] = {}
row_idx = len(history['internal'])
history['visible'].append([html.escape(text), ''])
history['internal'].append([apply_extensions('input', text, state, is_chat=True), ''])
update_message_metadata(history['metadata'], "user", row_idx, timestamp=get_current_timestamp())
return history
def send_dummy_reply(text, state):
history = state['history']
# Handle both dict and string inputs
if isinstance(text, dict):
text = text['text']
# Initialize metadata if not present
if 'metadata' not in history:
history['metadata'] = {}
if len(history['visible']) > 0 and not history['visible'][-1][1] == '':
row_idx = len(history['internal'])
history['visible'].append(['', ''])
history['internal'].append(['', ''])
# We don't need to add system metadata
row_idx = len(history['internal']) - 1
history['visible'][-1][1] = html.escape(text)
history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True)
update_message_metadata(history['metadata'], "assistant", row_idx, timestamp=get_current_timestamp())
return history
@ -565,7 +874,8 @@ def redraw_html(history, name1, name2, mode, style, character, reset_cache=False
def start_new_chat(state):
mode = state['mode']
history = {'internal': [], 'visible': []}
# Initialize with empty metadata dictionary
history = {'internal': [], 'visible': [], 'metadata': {}}
if mode != 'instruct':
greeting = replace_character_names(state['greeting'], state['name1'], state['name2'])
@ -573,6 +883,9 @@ def start_new_chat(state):
history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]]
history['visible'] += [['', apply_extensions('output', html.escape(greeting), state, is_chat=True)]]
# Add timestamp for assistant's greeting
update_message_metadata(history['metadata'], "assistant", 0, timestamp=get_current_timestamp())
unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
save_history(history, unique_id, state['character_menu'], state['mode'])
@ -753,6 +1066,16 @@ def load_history(unique_id, character, mode):
'visible': f['data_visible']
}
# Add metadata if it doesn't exist
if 'metadata' not in history:
history['metadata'] = {}
# Add placeholder timestamps for existing messages
for i, (user_msg, asst_msg) in enumerate(history['internal']):
if user_msg and user_msg != '<|BEGIN-VISIBLE-CHAT|>':
update_message_metadata(history['metadata'], "user", i, timestamp="")
if asst_msg:
update_message_metadata(history['metadata'], "assistant", i, timestamp="")
return history
@ -768,6 +1091,16 @@ def load_history_json(file, history):
'visible': f['data_visible']
}
# Add metadata if it doesn't exist
if 'metadata' not in history:
history['metadata'] = {}
# Add placeholder timestamps
for i, (user_msg, asst_msg) in enumerate(history['internal']):
if user_msg and user_msg != '<|BEGIN-VISIBLE-CHAT|>':
update_message_metadata(history['metadata'], "user", i, timestamp="")
if asst_msg:
update_message_metadata(history['metadata'], "assistant", i, timestamp="")
return history
except:
return history
@ -1089,20 +1422,12 @@ def my_yaml_output(data):
return result
def handle_replace_last_reply_click(text, state):
history = replace_last_reply(text, state)
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html, ""]
def handle_send_dummy_message_click(text, state):
history = send_dummy_message(text, state)
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html, ""]
return [history, html, {"text": "", "files": []}]
def handle_send_dummy_reply_click(text, state):
@ -1110,7 +1435,7 @@ def handle_send_dummy_reply_click(text, state):
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html, ""]
return [history, html, {"text": "", "files": []}]
def handle_remove_last_click(state):
@ -1118,7 +1443,7 @@ def handle_remove_last_click(state):
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html, last_input]
return [history, html, {"text": last_input, "files": []}]
def handle_unique_id_select(state):
@ -1164,7 +1489,13 @@ def handle_delete_chat_confirm_click(state):
def handle_branch_chat_click(state):
history = state['history']
branch_from_index = state['branch_index']
if branch_from_index == -1:
history = state['history']
else:
history = state['history']
history['visible'] = history['visible'][:branch_from_index + 1]
history['internal'] = history['internal'][:branch_from_index + 1]
new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
save_history(history, new_unique_id, state['character_menu'], state['mode'])
@ -1175,7 +1506,93 @@ def handle_branch_chat_click(state):
past_chats_update = gr.update(choices=histories, value=new_unique_id)
return [history, html, past_chats_update]
return [history, html, past_chats_update, -1]
def handle_edit_message_click(state):
history = state['history']
message_index = int(state['edit_message_index'])
new_text = state['edit_message_text']
role = state['edit_message_role'] # "user" or "assistant"
if message_index >= len(history['internal']):
html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html_output]
role_idx = 0 if role == "user" else 1
if 'metadata' not in history:
history['metadata'] = {}
key = f"{role}_{message_index}"
if key not in history['metadata']:
history['metadata'][key] = {}
# If no versions exist yet for this message, store the current (pre-edit) content as the first version.
if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]:
original_content = history['internal'][message_index][role_idx]
original_visible = history['visible'][message_index][role_idx]
original_timestamp = history['metadata'][key].get('timestamp', get_current_timestamp())
history['metadata'][key]["versions"] = [{
"content": original_content,
"visible_content": original_visible,
"timestamp": original_timestamp
}]
history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True)
history['visible'][message_index][role_idx] = html.escape(new_text)
add_message_version(history, role, message_index, is_current=True)
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html_output]
def handle_navigate_version_click(state):
history = state['history']
message_index = int(state['navigate_message_index'])
direction = state['navigate_direction']
role = state['navigate_message_role']
if not role:
logger.error("Role not provided for version navigation.")
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html]
key = f"{role}_{message_index}"
if 'metadata' not in history or key not in history['metadata'] or 'versions' not in history['metadata'][key]:
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html]
metadata = history['metadata'][key]
versions = metadata['versions']
# Default to the last version if current_version_index is not set
current_idx = metadata.get('current_version_index', len(versions) - 1 if versions else 0)
if direction == 'left':
new_idx = max(0, current_idx - 1)
else: # right
new_idx = min(len(versions) - 1, current_idx + 1)
if new_idx == current_idx:
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
return [history, html]
msg_content_idx = 0 if role == 'user' else 1 # 0 for user content, 1 for assistant content in the pair
version_to_load = versions[new_idx]
history['internal'][message_index][msg_content_idx] = version_to_load['content']
history['visible'][message_index][msg_content_idx] = version_to_load['visible_content']
metadata['current_version_index'] = new_idx
update_message_metadata(history['metadata'], role, message_index, timestamp=version_to_load['timestamp'])
# Redraw and save
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
return [history, html]
def handle_rename_chat_click():
@ -1317,7 +1734,7 @@ def handle_your_picture_change(picture, state):
def handle_send_instruction_click(state):
state['mode'] = 'instruct'
state['history'] = {'internal': [], 'visible': []}
state['history'] = {'internal': [], 'visible': [], 'metadata': {}}
output = generate_chat_prompt("Input", state)


@ -119,7 +119,7 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
reset = True
# Maximum number of tokens to process in a single forward pass
max_chunk_size = 2048
max_chunk_size = 256
# Make the forward call
if labels is None:
@ -245,3 +245,20 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
return Exllamav3HF(pretrained_model_name_or_path)
def unload(self):
"""Properly unload the ExllamaV3 model and free GPU memory."""
if hasattr(self, 'ex_model') and self.ex_model is not None:
self.ex_model.unload()
self.ex_model = None
if hasattr(self, 'ex_cache') and self.ex_cache is not None:
self.ex_cache = None
# Clean up any additional ExllamaV3 resources
if hasattr(self, 'past_seq'):
self.past_seq = None
if hasattr(self, 'past_seq_negative'):
self.past_seq_negative = None
if hasattr(self, 'ex_cache_negative'):
self.ex_cache_negative = None


@ -169,11 +169,7 @@ def convert_to_markdown(string, message_id=None):
thinking_block = f'''
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
<summary class="thinking-header">
<svg class="thinking-icon" width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M8 1.33334C4.31868 1.33334 1.33334 4.31868 1.33334 8.00001C1.33334 11.6813 4.31868 14.6667 8 14.6667C11.6813 14.6667 14.6667 11.6813 14.6667 8.00001C14.6667 4.31868 11.6813 1.33334 8 1.33334Z" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M8 10.6667V8.00001" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>
<path d="M8 5.33334H8.00667" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>
</svg>
{info_svg_small}
<span class="thinking-title">{title_text}</span>
</summary>
<div class="thinking-content pretty_scrollbar">{thinking_html}</div>
@ -339,41 +335,164 @@ copy_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" vie
refresh_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="tabler-icon tabler-icon-repeat"><path d="M4 12v-3a3 3 0 0 1 3 -3h13m-3 -3l3 3l-3 3"></path><path d="M20 12v3a3 3 0 0 1 -3 3h-13m3 3l-3 -3l3 -3"></path></svg>'''
continue_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon icon-tabler icons-tabler-outline icon-tabler-player-play"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M7 4v16l13 -8z" /></svg>'''
remove_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon icon-tabler icons-tabler-outline icon-tabler-trash"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M4 7l16 0" /><path d="M10 11l0 6" /><path d="M14 11l0 6" /><path d="M5 7l1 12a2 2 0 0 0 2 2h8a2 2 0 0 0 2 -2l1 -12" /><path d="M9 7v-3a1 1 0 0 1 1 -1h4a1 1 0 0 1 1 1v3" /></svg>'''
branch_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="icon icon-tabler icons-tabler-outline icon-tabler-git-branch"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M7 18m-2 0a2 2 0 1 0 4 0a2 2 0 1 0 -4 0" /><path d="M7 6m-2 0a2 2 0 1 0 4 0a2 2 0 1 0 -4 0" /><path d="M17 6m-2 0a2 2 0 1 0 4 0a2 2 0 1 0 -4 0" /><path d="M7 8l0 8" /><path d="M9 18h6a2 2 0 0 0 2 -2v-5" /><path d="M14 14l3 -3l3 3" /></svg>'''
edit_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="tabler-icon tabler-icon-pencil"><path d="M4 20h4l10.5 -10.5a2.828 2.828 0 1 0 -4 -4l-10.5 10.5v4"></path><path d="M13.5 6.5l4 4"></path></svg>'''
info_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="thinking-icon tabler-icon tabler-icon-info-circle"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M12 2a10 10 0 0 1 0 20a10 10 0 0 1 0 -20z" /><path d="M12 16v-4" /><path d="M12 8h.01" /></svg>'''
info_svg_small = '''<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="thinking-icon tabler-icon tabler-icon-info-circle"><path stroke="none" d="M0 0h24v24H0z" fill="none"/><path d="M12 2a10 10 0 0 1 0 20a10 10 0 0 1 0 -20z" /><path d="M12 16v-4" /><path d="M12 8h.01" /></svg>'''
attachment_svg = '''<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M21.44 11.05l-9.19 9.19a6 6 0 0 1-8.48-8.48l9.19-9.19a4 4 0 0 1 5.66 5.66l-9.2 9.19a2 2 0 0 1-2.83-2.83l8.49-8.48"></path></svg>'''
copy_button = f'<button class="footer-button footer-copy-button" title="Copy" onclick="copyToClipboard(this)">{copy_svg}</button>'
branch_button = f'<button class="footer-button footer-branch-button" title="Branch here" onclick="branchHere(this)">{branch_svg}</button>'
edit_button = f'<button class="footer-button footer-edit-button" title="Edit" onclick="editHere(this)">{edit_svg}</button>'
refresh_button = f'<button class="footer-button footer-refresh-button" title="Regenerate" onclick="regenerateClick()">{refresh_svg}</button>'
continue_button = f'<button class="footer-button footer-continue-button" title="Continue" onclick="continueClick()">{continue_svg}</button>'
remove_button = f'<button class="footer-button footer-remove-button" title="Remove last reply" onclick="removeLastClick()">{remove_svg}</button>'
info_button = f'<button class="footer-button footer-info-button" title="message">{info_svg}</button>'
def format_message_timestamp(history, role, index):
"""Get a formatted timestamp HTML span for a message if available"""
key = f"{role}_{index}"
if 'metadata' in history and key in history['metadata'] and history['metadata'][key].get('timestamp'):
timestamp = history['metadata'][key]['timestamp']
return f"<span class='timestamp'>{timestamp}</span>"
return ""
def format_message_attachments(history, role, index):
"""Get formatted HTML for message attachments if available"""
key = f"{role}_{index}"
if 'metadata' in history and key in history['metadata'] and 'attachments' in history['metadata'][key]:
attachments = history['metadata'][key]['attachments']
if not attachments:
return ""
attachments_html = '<div class="message-attachments">'
for attachment in attachments:
name = html.escape(attachment["name"])
# Make clickable if URL exists
if "url" in attachment:
name = f'<a href="{html.escape(attachment["url"])}" target="_blank" rel="noopener noreferrer">{name}</a>'
attachments_html += (
f'<div class="attachment-box">'
f'<div class="attachment-icon">{attachment_svg}</div>'
f'<div class="attachment-name">{name}</div>'
f'</div>'
)
attachments_html += '</div>'
return attachments_html
return ""
def get_version_navigation_html(history, i, role):
"""Generate simple navigation arrows for message versions"""
key = f"{role}_{i}"
metadata = history.get('metadata', {})
if key not in metadata or 'versions' not in metadata[key]:
return ""
versions = metadata[key]['versions']
# Default to the last version if current_version_index isn't set in metadata
current_idx = metadata[key].get('current_version_index', len(versions) - 1 if versions else 0)
if len(versions) <= 1:
return ""
left_disabled = ' disabled' if current_idx == 0 else ''
right_disabled = ' disabled' if current_idx >= len(versions) - 1 else ''
left_arrow = f'<button class="footer-button version-nav-button"{left_disabled} onclick="navigateVersion(this, \'left\')" title="Previous version">&lt;</button>'
right_arrow = f'<button class="footer-button version-nav-button"{right_disabled} onclick="navigateVersion(this, \'right\')" title="Next version">&gt;</button>'
position = f'<span class="version-position">{current_idx + 1}/{len(versions)}</span>'
return f'<div class="version-navigation">{left_arrow}{position}{right_arrow}</div>'
def actions_html(history, i, role, info_message=""):
action_buttons = ""
version_nav_html = ""
if role == "assistant":
action_buttons = (
f'{copy_button}'
f'{edit_button}'
f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
f'{continue_button if i == len(history["visible"]) - 1 else ""}'
f'{remove_button if i == len(history["visible"]) - 1 else ""}'
f'{branch_button}'
)
version_nav_html = get_version_navigation_html(history, i, "assistant")
elif role == "user":
action_buttons = (
f'{copy_button}'
f'{edit_button}'
)
version_nav_html = get_version_navigation_html(history, i, "user")
return (f'<div class="message-actions">'
f'{action_buttons}'
f'{info_message}'
f'</div>'
f'{version_nav_html}')
def generate_instruct_html(history):
output = f'<style>{instruct_css}</style><div class="chat" id="chat"><div class="messages">'
output = f'<style>{instruct_css}</style><div class="chat" id="chat" data-mode="instruct"><div class="messages">'
for i in range(len(history['visible'])):
row_visible = history['visible'][i]
row_internal = history['internal'][i]
converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
# Get timestamps
user_timestamp = format_message_timestamp(history, "user", i)
assistant_timestamp = format_message_timestamp(history, "assistant", i)
# Get attachments
user_attachments = format_message_attachments(history, "user", i)
assistant_attachments = format_message_attachments(history, "assistant", i)
# Create info buttons for timestamps if they exist
info_message_user = ""
if user_timestamp != "":
# Extract the timestamp value from the span
user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0]
info_message_user = info_button.replace("message", user_timestamp_value)
info_message_assistant = ""
if assistant_timestamp != "":
# Extract the timestamp value from the span
assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0]
info_message_assistant = info_button.replace("message", assistant_timestamp_value)
if converted_visible[0]: # Don't display empty user messages
output += (
f'<div class="user-message" '
f'data-raw="{html.escape(row_internal[0], quote=True)}">'
f'data-raw="{html.escape(row_internal[0], quote=True)}"'
f'data-index={i}>'
f'<div class="text">'
f'<div class="message-body">{converted_visible[0]}</div>'
f'{copy_button}'
f'{user_attachments}'
f'{actions_html(history, i, "user", info_message_user)}'
f'</div>'
f'</div>'
)
output += (
f'<div class="assistant-message" '
f'data-raw="{html.escape(row_internal[1], quote=True)}">'
f'data-raw="{html.escape(row_internal[1], quote=True)}"'
f'data-index={i}>'
f'<div class="text">'
f'<div class="message-body">{converted_visible[1]}</div>'
f'{copy_button}'
f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
f'{continue_button if i == len(history["visible"]) - 1 else ""}'
f'{remove_button if i == len(history["visible"]) - 1 else ""}'
f'{assistant_attachments}'
f'{actions_html(history, i, "assistant", info_message_assistant)}'
f'</div>'
f'</div>'
)
@ -401,30 +520,39 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache=
row_internal = history['internal'][i]
converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
# Get timestamps
user_timestamp = format_message_timestamp(history, "user", i)
assistant_timestamp = format_message_timestamp(history, "assistant", i)
# Get attachments
user_attachments = format_message_attachments(history, "user", i)
assistant_attachments = format_message_attachments(history, "assistant", i)
if converted_visible[0]: # Don't display empty user messages
output += (
f'<div class="message" '
f'data-raw="{html.escape(row_internal[0], quote=True)}">'
f'data-raw="{html.escape(row_internal[0], quote=True)}"'
f'data-index={i}>'
f'<div class="circle-you">{img_me}</div>'
f'<div class="text">'
f'<div class="username">{name1}</div>'
f'<div class="username">{name1}{user_timestamp}</div>'
f'<div class="message-body">{converted_visible[0]}</div>'
f'{copy_button}'
f'{user_attachments}'
f'{actions_html(history, i, "user")}'
f'</div>'
f'</div>'
)
output += (
f'<div class="message" '
f'data-raw="{html.escape(row_internal[1], quote=True)}">'
f'data-raw="{html.escape(row_internal[1], quote=True)}"'
f'data-index={i}>'
f'<div class="circle-bot">{img_bot}</div>'
f'<div class="text">'
f'<div class="username">{name2}</div>'
f'<div class="username">{name2}{assistant_timestamp}</div>'
f'<div class="message-body">{converted_visible[1]}</div>'
f'{copy_button}'
f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
f'{continue_button if i == len(history["visible"]) - 1 else ""}'
f'{remove_button if i == len(history["visible"]) - 1 else ""}'
f'{assistant_attachments}'
f'{actions_html(history, i, "assistant")}'
f'</div>'
f'</div>'
)
@ -441,26 +569,48 @@ def generate_chat_html(history, name1, name2, reset_cache=False):
row_internal = history['internal'][i]
converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
# Get timestamps
user_timestamp = format_message_timestamp(history, "user", i)
assistant_timestamp = format_message_timestamp(history, "assistant", i)
# Get attachments
user_attachments = format_message_attachments(history, "user", i)
assistant_attachments = format_message_attachments(history, "assistant", i)
# Create info buttons for timestamps if they exist
info_message_user = ""
if user_timestamp != "":
# Extract the timestamp value from the span
user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0]
info_message_user = info_button.replace("message", user_timestamp_value)
info_message_assistant = ""
if assistant_timestamp != "":
# Extract the timestamp value from the span
assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0]
info_message_assistant = info_button.replace("message", assistant_timestamp_value)
if converted_visible[0]: # Don't display empty user messages
output += (
f'<div class="message" '
f'data-raw="{html.escape(row_internal[0], quote=True)}">'
f'data-raw="{html.escape(row_internal[0], quote=True)}"'
f'data-index={i}>'
f'<div class="text-you">'
f'<div class="message-body">{converted_visible[0]}</div>'
f'{copy_button}'
f'{user_attachments}'
f'{actions_html(history, i, "user", info_message_user)}'
f'</div>'
f'</div>'
)
output += (
f'<div class="message" '
f'data-raw="{html.escape(row_internal[1], quote=True)}">'
f'data-raw="{html.escape(row_internal[1], quote=True)}"'
f'data-index={i}>'
f'<div class="text-bot">'
f'<div class="message-body">{converted_visible[1]}</div>'
f'{copy_button}'
f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
f'{continue_button if i == len(history["visible"]) - 1 else ""}'
f'{remove_button if i == len(history["visible"]) - 1 else ""}'
f'{assistant_attachments}'
f'{actions_html(history, i, "assistant", info_message_assistant)}'
f'</div>'
f'</div>'
)


@ -66,7 +66,7 @@ class LlamaServer:
"top_k": state["top_k"],
"top_p": state["top_p"],
"min_p": state["min_p"],
"tfs_z": state["tfs"],
"top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1,
"typical_p": state["typical_p"],
"repeat_penalty": state["repetition_penalty"],
"repeat_last_n": state["repetition_penalty_range"],
@ -102,8 +102,10 @@ class LlamaServer:
penalty_found = False
for s in samplers:
if s.strip() in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]:
if s.strip() in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc"]:
filtered_samplers.append(s.strip())
elif s.strip() == "typical_p":
filtered_samplers.append("typ_p")
elif not penalty_found and s.strip() == "repetition_penalty":
filtered_samplers.append("penalties")
penalty_found = True
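Read as one change, the replaced condition and the new typical_p branch amount to a small rename-and-filter pass over the sampler priority list. A standalone sketch of that mapping (not the literal upstream method, which builds the list while assembling the payload):
def to_llama_server_samplers(samplers):
    # Keep the names llama-server understands, rename typical_p to typ_p,
    # and emit a single "penalties" entry the first time repetition_penalty appears.
    allowed = ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc"]
    filtered, penalty_found = [], False
    for s in (name.strip() for name in samplers):
        if s in allowed:
            filtered.append(s)
        elif s == "typical_p":
            filtered.append("typ_p")
        elif s == "repetition_penalty" and not penalty_found:
            filtered.append("penalties")
            penalty_found = True
    return filtered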
@ -144,8 +146,9 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print()
# Make a direct request with streaming enabled using a context manager
with self.session.post(url, json=payload, stream=True) as response:
# Make the generation request
response = self.session.post(url, json=payload, stream=True)
try:
response.raise_for_status() # Raise an exception for HTTP errors
full_text = ""
@ -182,6 +185,8 @@ class LlamaServer:
print(f"JSON decode error: {e}")
print(f"Problematic line: {line}")
continue
finally:
response.close()
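The streaming request now uses an explicit try/finally instead of a with-block, so the connection is released even when the caller stops consuming the stream early. A minimal self-contained sketch of the same pattern (names are illustrative, not the upstream method):
import requests

def stream_lines(session: requests.Session, url: str, payload: dict):
    # Open a streamed POST, surface HTTP errors early, and always close the
    # response when iteration ends or an exception propagates.
    response = session.post(url, json=payload, stream=True)
    try:
        response.raise_for_status()
        for line in response.iter_lines():
            if line:
                yield line.decode('utf-8')
    finally:
        response.close()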
def generate(self, prompt, state):
output = ""
@ -210,14 +215,15 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print()
response = self.session.post(url, json=payload)
result = response.json()
for retry in range(5):
response = self.session.post(url, json=payload)
result = response.json()
if "completion_probabilities" in result:
if use_samplers:
return result["completion_probabilities"][0]["top_probs"]
else:
return result["completion_probabilities"][0]["top_logprobs"]
if "completion_probabilities" in result:
if use_samplers:
return result["completion_probabilities"][0]["top_probs"]
else:
return result["completion_probabilities"][0]["top_logprobs"]
else:
raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
@ -255,9 +261,10 @@ class LlamaServer:
self.server_path,
"--model", self.model_path,
"--ctx-size", str(shared.args.ctx_size),
"--n-gpu-layers", str(shared.args.n_gpu_layers),
"--gpu-layers", str(shared.args.gpu_layers),
"--batch-size", str(shared.args.batch_size),
"--port", str(self.port),
"--no-webui",
]
if shared.args.flash_attn:
@ -278,8 +285,10 @@ class LlamaServer:
cmd.append("--no-kv-offload")
if shared.args.row_split:
cmd += ["--split-mode", "row"]
cache_type = "fp16"
if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types:
cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
cache_type = shared.args.cache_type
if shared.args.compress_pos_emb != 1:
cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
if shared.args.rope_freq_base > 0:
@ -316,9 +325,15 @@ class LlamaServer:
for flag_item in extra_flags.split(','):
if '=' in flag_item:
flag, value = flag_item.split('=', 1)
cmd += [f"--{flag}", value]
if len(flag) <= 3:
cmd += [f"-{flag}", value]
else:
cmd += [f"--{flag}", value]
else:
cmd.append(f"--{flag_item}")
if len(flag_item) <= 3:
cmd.append(f"-{flag_item}")
else:
cmd.append(f"--{flag_item}")
env = os.environ.copy()
if os.name == 'posix':
@ -333,6 +348,7 @@ class LlamaServer:
print(' '.join(str(item) for item in cmd[1:]))
print()
logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}")
# Start the server with pipes for output
self.process = subprocess.Popen(
cmd,


@ -5,7 +5,7 @@ import gradio as gr
loaders_and_params = OrderedDict({
'llama.cpp': [
'n_gpu_layers',
'gpu_layers',
'threads',
'threads_batch',
'batch_size',
@ -28,6 +28,7 @@ loaders_and_params = OrderedDict({
'device_draft',
'ctx_size_draft',
'speculative_decoding_accordion',
'vram_info',
],
'Transformers': [
'gpu_split',
@ -84,17 +85,11 @@ loaders_and_params = OrderedDict({
'no_flash_attn',
'no_xformers',
'no_sdpa',
'exllamav2_info',
'model_draft',
'draft_max',
'ctx_size_draft',
'speculative_decoding_accordion',
],
'HQQ': [
'hqq_backend',
'trust_remote_code',
'no_use_fast',
],
'TensorRT-LLM': [
'ctx_size',
'cpp_runner',
@ -158,7 +153,6 @@ def transformers_samplers():
loaders_samplers = {
'Transformers': transformers_samplers(),
'HQQ': transformers_samplers(),
'ExLlamav3_HF': {
'temperature',
'dynatemp_low',
@ -299,7 +293,7 @@ loaders_samplers = {
'typical_p',
'xtc_threshold',
'xtc_probability',
'tfs',
'top_n_sigma',
'dry_multiplier',
'dry_allowed_length',
'dry_base',


@ -7,6 +7,7 @@ from modules import models, shared
from modules.logging_colors import logger
from modules.models import load_model
from modules.text_generation import generate_reply
from modules.utils import check_model_loaded
global_scores = None
@ -33,9 +34,9 @@ def get_next_logits(*args, **kwargs):
def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
if shared.model is None:
logger.error("No model is loaded! Select one in the Model tab.")
return 'Error: No model is loaded! Select one in the Model tab.', previous
model_is_loaded, error_message = check_model_loaded()
if not model_is_loaded:
return error_message, previous
# llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer':


@ -21,7 +21,6 @@ def load_model(model_name, loader=None):
'ExLlamav3_HF': ExLlamav3_HF_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'HQQ': HQQ_loader,
'TensorRT-LLM': TensorRT_LLM_loader,
}
@ -71,7 +70,6 @@ def llama_cpp_server_loader(model_name):
else:
model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
logger.info(f"llama.cpp weights detected: \"{model_file}\"")
try:
model = LlamaServer(model_file)
return model, model
@ -103,21 +101,6 @@ def ExLlamav2_loader(model_name):
return model, tokenizer
def HQQ_loader(model_name):
try:
from hqq.core.quantize import HQQBackend, HQQLinear
from hqq.models.hf.base import AutoHQQHFModel
except ModuleNotFoundError:
raise ModuleNotFoundError("Failed to import 'hqq'. Please install it manually following the instructions in the HQQ GitHub repository.")
logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"")
model_dir = Path(f'{shared.args.model_dir}/{model_name}')
model = AutoHQQHFModel.from_quantized(str(model_dir))
HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend))
return model
def TensorRT_LLM_loader(model_name):
try:
from modules.tensorrt_llm import TensorRTLLMModel
@ -133,10 +116,13 @@ def unload_model(keep_model_name=False):
return
is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
if shared.args.loader == 'ExLlamav3_HF':
shared.model.unload()
shared.model = shared.tokenizer = None
shared.lora_names = []
shared.model_dirty_from_training = False
if not is_llamacpp:
from modules.torch_utils import clear_torch_cache
clear_torch_cache()


@ -1,7 +1,11 @@
import functools
import json
import re
import subprocess
from math import floor
from pathlib import Path
import gradio as gr
import yaml
from modules import chat, loaders, metadata_gguf, shared, ui
@ -54,7 +58,7 @@ def get_model_metadata(model):
else:
model_file = list(path.glob('*.gguf'))[0]
metadata = metadata_gguf.load_metadata(model_file)
metadata = load_gguf_metadata_with_cache(model_file)
for k in metadata:
if k.endswith('context_length'):
@ -67,7 +71,8 @@ def get_model_metadata(model):
elif k.endswith('rope.scaling.factor'):
model_settings['compress_pos_emb'] = metadata[k]
elif k.endswith('block_count'):
model_settings['n_gpu_layers'] = metadata[k] + 1
model_settings['gpu_layers'] = metadata[k] + 1
model_settings['max_gpu_layers'] = metadata[k] + 1
if 'tokenizer.chat_template' in metadata:
template = metadata['tokenizer.chat_template']
@ -149,7 +154,11 @@ def get_model_metadata(model):
for pat in settings:
if re.match(pat.lower(), Path(model).name.lower()):
for k in settings[pat]:
model_settings[k] = settings[pat][k]
new_k = k
if k == 'n_gpu_layers':
new_k = 'gpu_layers'
model_settings[new_k] = settings[pat][k]
# Load instruction template if defined by name rather than by value
if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
@ -174,8 +183,6 @@ def infer_loader(model_name, model_settings, hf_quant_method=None):
loader = 'ExLlamav3_HF'
elif re.match(r'.*exl2', model_name.lower()):
loader = 'ExLlamav2_HF'
elif re.match(r'.*-hqq', model_name.lower()):
return 'HQQ'
else:
loader = 'Transformers'
@ -209,15 +216,27 @@ def apply_model_settings_to_state(model, state):
model_settings = get_model_metadata(model)
if 'loader' in model_settings:
loader = model_settings.pop('loader')
# If the user is using an alternative loader for the same model type, let them keep using it
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
state['loader'] = loader
for k in model_settings:
if k in state:
if k in state and k != 'gpu_layers': # Skip gpu_layers, handle separately
state[k] = model_settings[k]
# Handle GPU layers and VRAM update for llama.cpp
if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
vram_info, gpu_layers_update = update_gpu_layers_and_vram(
state['loader'],
model,
model_settings['gpu_layers'],
state['ctx_size'],
state['cache_type'],
auto_adjust=True
)
state['gpu_layers'] = gpu_layers_update
state['vram_info'] = vram_info
return state
@ -277,3 +296,197 @@ def save_instruction_template(model, template):
yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
else:
yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")
@functools.lru_cache(maxsize=1)
def load_gguf_metadata_with_cache(model_file):
return metadata_gguf.load_metadata(model_file)
def get_model_size_mb(model_file: Path) -> float:
filename = model_file.name
# Check for multipart pattern
match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename)
if match:
# It's a multipart file, find all matching parts
base_pattern = match.group(1)
part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf'))
total_size = sum(p.stat().st_size for p in part_files)
else:
# Single part
total_size = model_file.stat().st_size
return total_size / (1024 ** 2) # Return size in MB
def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
metadata = load_gguf_metadata_with_cache(model_file)
size_in_mb = get_model_size_mb(model_file)
# Extract values from metadata
n_layers = None
n_kv_heads = None
embedding_dim = None
for key, value in metadata.items():
if key.endswith('.block_count'):
n_layers = value
elif key.endswith('.attention.head_count_kv'):
n_kv_heads = max(value) if isinstance(value, list) else value
elif key.endswith('.embedding_length'):
embedding_dim = value
if gpu_layers > n_layers:
gpu_layers = n_layers
# Convert cache_type to numeric
if cache_type == 'q4_0':
cache_type = 4
elif cache_type == 'q8_0':
cache_type = 8
else:
cache_type = 16
# Derived features
size_per_layer = size_in_mb / max(n_layers, 1e-6)
kv_cache_factor = n_kv_heads * cache_type * ctx_size
embedding_per_context = embedding_dim / ctx_size
# Calculate VRAM using the model
# Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
vram = (
(size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
* (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
+ 1516.522943869404
)
return vram
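A hedged usage sketch of the two helpers above (the file name and parameter values are hypothetical; estimate_vram resolves the name against shared.args.model_dir itself, and get_model_size_mb sums every part of a split *-00001-of-0000N.gguf download):
# Only the GGUF file name is passed; the directory comes from shared.args.model_dir.
gguf_name = "some-model-Q4_K_M-00001-of-00002.gguf"
vram_mib = estimate_vram(gguf_name, gpu_layers=48, ctx_size=16384, cache_type="q8_0")
print(f"Estimated VRAM: {vram_mib:.0f} MiB")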
def get_nvidia_vram(return_free=True):
"""
Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output.
Args:
return_free (bool): If True, returns free VRAM. If False, returns total VRAM.
Returns:
int: Either the total free VRAM or total VRAM in MiB summed across all detected NVIDIA GPUs.
Returns -1 if nvidia-smi command fails (not found, error, etc.).
Returns 0 if nvidia-smi succeeds but no GPU memory info found.
"""
try:
# Execute nvidia-smi command
result = subprocess.run(
['nvidia-smi'],
capture_output=True,
text=True,
check=False
)
# Check if nvidia-smi returned an error
if result.returncode != 0:
return -1
# Parse the output for memory usage patterns
output = result.stdout
# Find memory usage like "XXXXMiB / YYYYMiB"
# Captures used and total memory for each GPU
matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output)
if not matches:
# No GPUs found in expected format
return 0
total_vram_mib = 0
total_free_vram_mib = 0
for used_mem_str, total_mem_str in matches:
try:
used_mib = int(used_mem_str)
total_mib = int(total_mem_str)
total_vram_mib += total_mib
total_free_vram_mib += (total_mib - used_mib)
except ValueError:
# Skip malformed entries
pass
# Return either free or total VRAM based on the flag
return total_free_vram_mib if return_free else total_vram_mib
except FileNotFoundError:
# nvidia-smi not found (likely no NVIDIA drivers installed)
return -1
except Exception:
# Handle any other unexpected exceptions
return -1
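A short usage sketch of the sentinel values described in the docstring:
# -1: nvidia-smi missing or failed; 0: output matched no "used MiB / total MiB"
# pairs; any positive number: MiB summed across all detected GPUs.
free_mib = get_nvidia_vram(return_free=True)
if free_mib > 0:
    print(f"Free VRAM across NVIDIA GPUs: {free_mib} MiB")
elif free_mib == 0:
    print("nvidia-smi ran but reported no memory usage lines")
else:
    print("nvidia-smi unavailable, skipping VRAM-based auto-adjustment")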
def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
"""
Unified function to handle GPU layers and VRAM updates.
Args:
for_ui: If True, returns Gradio updates. If False, returns raw values.
Returns:
- If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
- If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
"""
if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"):
vram_info = "<div id=\"vram-info\"'>Estimated VRAM to load the model:</div>"
if for_ui:
return (vram_info, gr.update()) if auto_adjust else vram_info
else:
return (0, gpu_layers) if auto_adjust else 0
current_layers = gpu_layers
max_layers = gpu_layers
if auto_adjust:
# Get model settings including user preferences
model_settings = get_model_metadata(model)
# Get the true maximum layers
max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers))
# Check if this is a user-saved setting
user_config = shared.user_config
model_regex = Path(model).name + '$'
has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex]
if has_user_setting:
# For user settings, just use the current value (which already has user pref)
# but ensure the slider maximum is correct
current_layers = gpu_layers # Already has user setting
else:
# No user setting, auto-adjust from the maximum
current_layers = max_layers # Start from max
# Auto-adjust based on available/total VRAM
# If a model is loaded and it's for the UI, use the total VRAM to avoid confusion
return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True
available_vram = get_nvidia_vram(return_free=return_free)
if available_vram > 0:
tolerance = 577
while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
current_layers -= 1
# Calculate VRAM with current layers
vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)
if for_ui:
vram_info = f"<div id=\"vram-info\"'>Estimated VRAM to load the model: <span class=\"value\">{vram_usage:.0f} MiB</span></div>"
if auto_adjust:
return vram_info, gr.update(value=current_layers, maximum=max_layers)
else:
return vram_info
else:
if auto_adjust:
return vram_usage, current_layers
else:
return vram_usage
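For the non-UI path described in the docstring, a hedged call sketch (the model name is hypothetical):
# Returns raw numbers instead of Gradio updates: the estimated VRAM in MiB and
# the auto-adjusted layer count that fits within the detected VRAM budget.
vram_mib, layers = update_gpu_layers_and_vram(
    'llama.cpp', 'some-model.gguf', 256,
    ctx_size=16384, cache_type='q8_0',
    auto_adjust=True, for_ui=False,
)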


@ -11,7 +11,7 @@ from modules.logging_colors import logger
def default_preset():
return {
result = {
'temperature': 1,
'dynatemp_low': 1,
'dynatemp_high': 1,
@ -46,10 +46,17 @@ def default_preset():
'do_sample': True,
'dynamic_temperature': False,
'temperature_last': False,
'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntop_n_sigma\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
'dry_sequence_breakers': '"\\n", ":", "\\"", "*"',
}
if shared.args.portable:
samplers = result['sampler_priority'].split('\n')
samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]]
result['sampler_priority'] = '\n'.join(samplers)
return result
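Applied to the default priority above, the portable filter keeps, in order: repetition_penalty, dry, top_n_sigma, temperature, top_k, top_p, typical_p, min_p, xtc. A hedged equivalent of the filter, written against the unfiltered string:
allowed = ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature",
           "xtc", "typical_p", "repetition_penalty"]
default_priority = default_preset()['sampler_priority']   # unfiltered when --portable is off
portable_priority = '\n'.join(s for s in default_priority.split('\n') if s in allowed)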
def presets_params():
return [k for k in default_preset()]


@ -60,7 +60,6 @@ settings = {
'custom_stopping_strings': '',
'custom_token_bans': '',
'negative_prompt': '',
'autoload_model': False,
'dark_theme': True,
'default_extensions': [],
'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}",
@ -88,7 +87,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, TensorRT-LLM.')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.')
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
@ -121,7 +120,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
@ -130,9 +129,9 @@ group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
# Cache
group = parser.add_argument_group('Context and cache management')
group = parser.add_argument_group('Context and cache')
group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.')
group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
# Speculative decoding
group = parser.add_argument_group('Speculative decoding')
@ -153,18 +152,10 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n
group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
# HQQ
group = parser.add_argument_group('HQQ')
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
# TensorRT-LLM
group = parser.add_argument_group('TensorRT-LLM')
group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
# Cache
group = parser.add_argument_group('Cache')
group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
# DeepSpeed
group = parser.add_argument_group('DeepSpeed')
group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
@ -190,6 +181,7 @@ group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certific
group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None)
group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy')
group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.')
group.add_argument('--portable', action='store_true', help='Hide features not available in portable mode like training.')
# API
group = parser.add_argument_group('API')
@ -267,8 +259,6 @@ def fix_loader_name(name):
return 'ExLlamav2_HF'
elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']:
return 'ExLlamav3_HF'
elif name in ['hqq']:
return 'HQQ'
elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
return 'TensorRT-LLM'
@ -311,11 +301,13 @@ if args.api or args.public_api:
add_extension('openai', last=True)
# Load model-specific settings
with Path(f'{args.model_dir}/config.yaml') as p:
if p.exists():
model_config = yaml.safe_load(open(p, 'r').read())
else:
model_config = {}
p = Path(f'{args.model_dir}/config.yaml')
if p.exists():
model_config = yaml.safe_load(open(p, 'r').read())
else:
model_config = {}
del p
# Load custom model-specific settings
user_config = load_user_config()


@ -1,15 +1,15 @@
from pathlib import Path
import torch
import tensorrt_llm
import torch
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
from modules import shared
from modules.logging_colors import logger
from modules.text_generation import (
get_max_prompt_length,
get_reply_from_output_ids
)
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
class TensorRTLLMModel:


@ -14,6 +14,7 @@ from modules.callbacks import Iteratorize
from modules.extensions import apply_extensions
from modules.html_generator import generate_basic_html
from modules.logging_colors import logger
from modules.utils import check_model_loaded
def generate_reply(*args, **kwargs):
@ -34,8 +35,8 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
# Find the appropriate generation function
generate_func = apply_extensions('custom_generate_reply')
if generate_func is None:
if shared.model_name == 'None' or shared.model is None:
logger.error("No model is loaded! Select one in the Model tab.")
model_is_loaded, error_message = check_model_loaded()
if not model_is_loaded:
yield ''
return
@ -471,7 +472,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
t1 = time.time()
original_tokens = len(original_input_ids[0])
new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
return
@ -480,7 +481,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
For models that do not use the transformers library for sampling
"""
seed = set_manual_seed(state['seed'])
state['seed'] = set_manual_seed(state['seed'])
t0 = time.time()
reply = ''
try:
@ -500,15 +501,15 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
t1 = time.time()
original_tokens = len(encode(original_question)[0])
new_tokens = len(encode(original_question + reply)[0]) - original_tokens
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})')
return
def print_prompt(prompt, max_chars=2000):
def print_prompt(prompt, max_chars=-1):
DARK_YELLOW = "\033[38;5;3m"
RESET = "\033[0m"
if len(prompt) > max_chars:
if max_chars > 0 and len(prompt) > max_chars:
half_chars = max_chars // 2
hidden_len = len(prompt[half_chars:-half_chars])
hidden_msg = f"{DARK_YELLOW}[...{hidden_len} characters hidden...]{RESET}"


@ -61,7 +61,7 @@ if not shared.args.old_colors:
background_fill_primary_dark='var(--darker-gray)',
body_background_fill="white",
block_background_fill="transparent",
body_text_color="#333",
body_text_color='rgb(64, 64, 64)',
button_secondary_background_fill="#f4f4f4",
button_secondary_border_color="var(--border-color-primary)",
@ -71,6 +71,7 @@ if not shared.args.old_colors:
block_background_fill_dark='transparent',
block_border_color_dark='transparent',
input_border_color_dark='var(--border-color-dark)',
input_border_color_focus_dark='var(--border-color-dark)',
checkbox_border_color_dark='var(--border-color-dark)',
border_color_primary_dark='var(--border-color-dark)',
button_secondary_border_color_dark='var(--border-color-dark)',
@ -89,6 +90,8 @@ if not shared.args.old_colors:
checkbox_label_shadow='none',
block_shadow='none',
block_shadow_dark='none',
input_shadow_focus='none',
input_shadow_focus_dark='none',
button_large_radius='0.375rem',
button_large_padding='6px 12px',
input_radius='0.375rem',
@ -105,11 +108,10 @@ def list_model_elements():
'filter_by_loader',
'loader',
'cpu_memory',
'n_gpu_layers',
'gpu_layers',
'threads',
'threads_batch',
'batch_size',
'hqq_backend',
'ctx_size',
'cache_type',
'tensor_split',
@ -211,6 +213,15 @@ def list_interface_input_elements():
'negative_prompt',
'dry_sequence_breakers',
'grammar_string',
'navigate_message_index',
'navigate_direction',
'navigate_message_role',
'edit_message_index',
'edit_message_text',
'edit_message_role',
'branch_index',
'enable_web_search',
'web_search_pages',
]
# Chat elements


@ -24,7 +24,8 @@ def create_ui():
with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']):
with gr.Column():
with gr.Row(elem_id='past-chats-buttons'):
shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', interactive=not mu)
shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', elem_id='Branch', interactive=not mu)
shared.gradio['branch_index'] = gr.Number(value=-1, precision=0, visible=False, elem_id="Branch-index", interactive=True)
shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu)
shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu)
shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input'])
@ -46,14 +47,14 @@ def create_ui():
with gr.Row():
with gr.Column(elem_id='chat-col'):
shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer
shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': [], 'metadata': {}}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
with gr.Row(elem_id="chat-input-row"):
with gr.Column(scale=1, elem_id='gr-hover-container'):
gr.HTML(value='<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">&#9776;</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover')
with gr.Column(scale=10, elem_id='chat-input-container'):
shared.gradio['textbox'] = gr.Textbox(label='', placeholder='Send a message', elem_id='chat-input', elem_classes=['add_scrollbar'])
shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar'])
shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls')
shared.gradio['typing-dots'] = gr.HTML(value='<div class="typing"><span></span><span class="dot1"></span><span class="dot2"></span></div>', label='typing', elem_id='typing-container')
@ -70,8 +71,6 @@ def create_ui():
shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last')
with gr.Row():
shared.gradio['Replace last reply'] = gr.Button('Replace last reply (Ctrl + Shift + L)', elem_id='Replace-last')
shared.gradio['Copy last reply'] = gr.Button('Copy last reply (Ctrl + Shift + K)', elem_id='Copy-last')
shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate')
with gr.Row():
@ -79,14 +78,20 @@ def create_ui():
shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply')
with gr.Row():
shared.gradio['send-chat-to-default'] = gr.Button('Send to default')
shared.gradio['send-chat-to-notebook'] = gr.Button('Send to notebook')
shared.gradio['send-chat-to-default'] = gr.Button('Send to Default')
shared.gradio['send-chat-to-notebook'] = gr.Button('Send to Notebook')
with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']):
with gr.Column():
with gr.Row():
shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
with gr.Row():
shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search')
with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']:
shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10)
with gr.Row():
shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template under Parameters > Instruction template is used.', elem_id='chat-mode')
@ -96,6 +101,22 @@ def create_ui():
with gr.Row():
shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])
with gr.Row():
shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm')
shared.gradio['token_display'] = gr.HTML(value='', elem_classes='token-display')
# Hidden elements for version navigation and editing
with gr.Row(visible=False):
shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index")
shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction")
shared.gradio['navigate_message_role'] = gr.Textbox(value="", elem_id="Navigate-message-role")
shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version")
shared.gradio['edit_message_index'] = gr.Number(value=-1, precision=0, elem_id="Edit-message-index")
shared.gradio['edit_message_text'] = gr.Textbox(value="", elem_id="Edit-message-text")
shared.gradio['edit_message_role'] = gr.Textbox(value="", elem_id="Edit-message-role")
shared.gradio['edit_message'] = gr.Button(elem_id="Edit-message")
def create_chat_settings_ui():
mu = shared.args.multi_user
@ -185,7 +206,7 @@ def create_event_handlers():
shared.gradio['Generate'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then(
lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then(
lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then(
chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then(
None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then(
@ -193,7 +214,7 @@ def create_event_handlers():
shared.gradio['textbox'].submit(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then(
lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then(
lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then(
chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then(
None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then(
@ -221,10 +242,6 @@ def create_event_handlers():
None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then(
None, None, None, js=f'() => {{{ui.audio_notification_js}}}')
shared.gradio['Replace last reply'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_replace_last_reply_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False)
shared.gradio['Send dummy message'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_send_dummy_message_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False)
@ -258,7 +275,7 @@ def create_event_handlers():
shared.gradio['branch_chat'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False)
chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'branch_index'), show_progress=False)
shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False)
shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False)
@ -290,7 +307,14 @@ def create_event_handlers():
None, gradio('mode'), None, js="(mode) => {mode === 'instruct' ? document.getElementById('character-menu').parentNode.parentNode.style.display = 'none' : document.getElementById('character-menu').parentNode.parentNode.style.display = ''}")
shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False)
shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False)
shared.gradio['navigate_version'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_navigate_version_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False)
shared.gradio['edit_message'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False)
# Save/delete a character
shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False)
@ -347,3 +371,13 @@ def create_event_handlers():
None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}')
shared.gradio['show_controls'].change(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
shared.gradio['count_tokens'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
chat.count_prompt_tokens, gradio('textbox', 'interface_state'), gradio('token_display'), show_progress=False)
shared.gradio['enable_web_search'].change(
lambda x: gr.update(visible=x),
gradio('enable_web_search'),
gradio('web_search_row')
)


@ -14,6 +14,7 @@ from modules.models_settings import (
get_model_metadata,
save_instruction_template,
save_model_settings,
update_gpu_layers_and_vram,
update_model_parameters
)
from modules.utils import gradio
@ -26,71 +27,34 @@ def create_ui():
with gr.Row():
with gr.Column():
with gr.Row():
with gr.Column():
with gr.Row():
shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu)
shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)
shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu)
shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)
with gr.Column():
with gr.Row():
shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
with gr.Row():
with gr.Column():
shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None)
with gr.Blocks():
gr.Markdown("## Main options")
with gr.Row():
with gr.Column():
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation="eager" while loading the model.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).')
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
# Speculative decoding
@ -99,15 +63,50 @@ def create_ui():
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.')
shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.')
shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
with gr.Column():
with gr.Row():
shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu)
gr.Markdown("## Other options")
with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'):
with gr.Row():
with gr.Column():
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
with gr.Column():
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation="eager" while loading the model.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
if not shared.args.portable:
with gr.Row():
shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
with gr.Column():
with gr.Tab("Download"):
shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu)
shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu)
@ -132,11 +131,10 @@ def create_event_handlers():
# In this event handler, the interface state is read and updated
# with the model defaults (if any), and then the model is loaded
# unless "autoload_model" is unchecked
shared.gradio['model_menu'].change(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then(
load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success(
partial(load_model_wrapper, autoload=False), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success(
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)
shared.gradio['load_model'].click(
@ -145,15 +143,31 @@ def create_event_handlers():
partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success(
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)
shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False)
shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then(
partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False)
shared.gradio['save_model_settings'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)
shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
# For ctx_size and cache_type - auto-adjust GPU layers
for param in ['ctx_size', 'cache_type']:
shared.gradio[param].change(
partial(update_gpu_layers_and_vram, auto_adjust=True),
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info', 'gpu_layers'), show_progress=False)
# For manual gpu_layers changes - only update VRAM
shared.gradio['gpu_layers'].change(
partial(update_gpu_layers_and_vram, auto_adjust=False),
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info'), show_progress=False)
if not shared.args.portable:
shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model'))
shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True)
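For orientation, the handlers above call update_gpu_layers_and_vram with either one or two outputs depending on auto_adjust. A stand-in sketch of that interface, inferred only from the call sites visible in this diff (the real function lives in modules/models_settings.py and is not shown here; the VRAM figure below is a placeholder, not the actual estimation logic):
# Stand-in sketch only; the real update_gpu_layers_and_vram() is defined in
# modules/models_settings.py and is not part of this diff.
def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type,
                               auto_adjust=False, for_ui=True):
    # Placeholder estimate; the real function derives this from the model file.
    estimated_vram_gb = 0.05 * gpu_layers + ctx_size / 32768
    if not for_ui:
        return estimated_vram_gb, gpu_layers
    vram_info = f'<div id="vram-info">Estimated VRAM to load the model: {estimated_vram_gb:.1f} GiB</div>'
    if auto_adjust:
        return vram_info, gpu_layers   # feeds ('vram_info', 'gpu_layers')
    return vram_info                   # feeds ('vram_info',)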
@ -192,6 +206,26 @@ def load_lora_wrapper(selected_loras):
def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
try:
# Handle direct GGUF URLs
if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")):
try:
path = repo_id.split("huggingface.co/")[1]
# Extract the repository ID (first two parts of the path)
parts = path.split("/")
if len(parts) >= 2:
extracted_repo_id = f"{parts[0]}/{parts[1]}"
# Extract the filename (last part of the path)
filename = repo_id.split("/")[-1]
if "?download=true" in filename:
filename = filename.replace("?download=true", "")
repo_id = extracted_repo_id
specific_file = filename
except:
pass
if repo_id == "":
yield ("Please enter a model path")
return
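To make the URL handling above concrete: a direct link such as the hypothetical https://huggingface.co/org/repo/resolve/main/model-Q4_K_M.gguf?download=true reduces to repo_id 'org/repo' and specific_file 'model-Q4_K_M.gguf'. A standalone sketch of the same parsing:
# Standalone sketch of the parsing above; the URL is a made-up example.
url = "https://huggingface.co/org/repo/resolve/main/model-Q4_K_M.gguf?download=true"
path = url.split("huggingface.co/")[1]                             # "org/repo/resolve/main/..."
parts = path.split("/")
repo_id = f"{parts[0]}/{parts[1]}"                                 # "org/repo"
specific_file = url.split("/")[-1].replace("?download=true", "")   # "model-Q4_K_M.gguf"
print(repo_id, specific_file)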
@ -205,6 +239,18 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
yield ("Getting the download links from Hugging Face")
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file)
# Check for multiple GGUF files
gguf_files = [link for link in links if link.lower().endswith('.gguf')]
if len(gguf_files) > 1 and not specific_file:
output = "Multiple GGUF files found. Please copy one of the following filenames to the 'File name' field:\n\n```\n"
for link in gguf_files:
output += f"{Path(link).name}\n"
output += "```"
yield output
return
if return_links:
output = "```\n"
for link in links:
@ -252,10 +298,34 @@ def update_truncation_length(current_length, state):
return current_length
def get_initial_vram_info():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
return update_gpu_layers_and_vram(
shared.args.loader,
shared.model_name,
shared.args.gpu_layers,
shared.args.ctx_size,
shared.args.cache_type,
auto_adjust=False,
for_ui=True
)
return "<div id=\"vram-info\"'>Estimated VRAM to load the model:</div>"
def get_initial_gpu_layers_max():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
model_settings = get_model_metadata(shared.model_name)
return model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', 256))
return 256
def handle_load_model_event_initial(model, state):
state = apply_model_settings_to_state(model, state)
output = ui.apply_interface_values(state)
update_model_parameters(state)
update_model_parameters(state) # This updates the command-line flags
return output + [state]
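As a side note on the RoPE-scaling fields earlier in this file, the info strings state rope_freq_base = 10000 * alpha_value ^ (64 / 63) and compress_pos_emb = (target context) / (original context) = 1 / rope_freq_scale. A quick worked example, with the context lengths chosen purely for illustration:
# Worked example of the RoPE relationships quoted in the UI info strings above.
alpha_value = 2.5                                  # suggested for roughly 2x context (NTKv1)
rope_freq_base = 10000 * alpha_value ** (64 / 63)  # ~25366

original_ctx = 4096                                # assumed original training context
target_ctx = 8192                                  # assumed desired context
compress_pos_emb = target_ctx / original_ctx       # 2.0, i.e. rope_freq_scale = 0.5
print(round(rope_freq_base), compress_pos_emb)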


@ -21,7 +21,7 @@ def create_ui(default_preset):
shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button')
with gr.Column():
shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown')
shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown')
with gr.Row():
with gr.Column():
@ -82,7 +82,7 @@ def create_ui(default_preset):
shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')


@ -23,11 +23,15 @@ def create_ui():
shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table')
with gr.Column():
extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
extension_status = gr.Markdown()
if not shared.args.portable:
extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
extension_status = gr.Markdown()
else:
pass
shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light')
extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
if not shared.args.portable:
extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
# Reset interface event
shared.gradio['reset_interface'].click(


@ -72,6 +72,20 @@ def natural_keys(text):
return [atoi(c) for c in re.split(r'(\d+)', text)]
def check_model_loaded():
if shared.model_name == 'None' or shared.model is None:
if len(get_available_models()) == 0:
error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it"
logger.error(error_msg)
return False, error_msg
else:
error_msg = "No model is loaded. Please select one in the Model tab."
logger.error(error_msg)
return False, error_msg
return True, None
def get_available_models():
# Get all GGUF files
gguf_files = get_available_ggufs()
@ -123,7 +137,7 @@ def get_available_models():
model_dirs = sorted(model_dirs, key=natural_keys)
return ['None'] + filtered_gguf_files + model_dirs
return filtered_gguf_files + model_dirs
def get_available_ggufs():
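A minimal sketch of how a caller might consume the (loaded, error_message) pair returned by check_model_loaded above; the call site below is hypothetical and assumes this file is modules/utils.py:
# Hypothetical caller; real call sites live in the generation code paths.
from modules.utils import check_model_loaded

def example_generate(prompt):
    loaded, error = check_model_loaded()
    if not loaded:
        return error        # surface the guidance text instead of generating
    return f"(generation for {prompt!r} would happen here)"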

modules/web_search.py (new file, 129 lines)

@ -0,0 +1,129 @@
import concurrent.futures
from concurrent.futures import as_completed
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from modules.logging_colors import logger
def get_current_timestamp():
"""Returns the current time in 24-hour format"""
return datetime.now().strftime('%b %d, %Y %H:%M')
def download_web_page(url, timeout=5):
"""Download and extract text from a web page"""
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers, timeout=timeout)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Remove script and style elements
for script in soup(["script", "style"]):
script.decompose()
# Get text and clean it up
text = soup.get_text()
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
text = ' '.join(chunk for chunk in chunks if chunk)
return text
except Exception as e:
logger.error(f"Error downloading {url}: {e}")
return f"[Error downloading content from {url}: {str(e)}]"
def perform_web_search(query, num_pages=3, max_workers=5):
"""Perform web search and return results with content"""
try:
with DDGS() as ddgs:
results = list(ddgs.text(query, max_results=num_pages))
# Prepare download tasks
download_tasks = []
for i, result in enumerate(results):
url = result.get('href', '')
title = result.get('title', f'Search Result {i+1}')
download_tasks.append((url, title, i))
search_results = [None] * len(download_tasks) # Pre-allocate to maintain order
# Download pages in parallel
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all download tasks
future_to_task = {
executor.submit(download_web_page, task[0]): task
for task in download_tasks
}
# Collect results as they complete
for future in as_completed(future_to_task):
url, title, index = future_to_task[future]
try:
content = future.result()
search_results[index] = {
'title': title,
'url': url,
'content': content
}
except Exception as e:
logger.error(f"Error downloading {url}: {e}")
# Include failed downloads with empty content
search_results[index] = {
'title': title,
'url': url,
'content': ''
}
return search_results
except Exception as e:
logger.error(f"Error performing web search: {e}")
return []
def add_web_search_attachments(history, row_idx, user_message, search_query, state):
"""Perform web search and add results as attachments"""
if not search_query:
logger.warning("No search query provided")
return
try:
logger.info(f"Using search query: {search_query}")
# Perform web search
num_pages = int(state.get('web_search_pages', 3))
search_results = perform_web_search(search_query, num_pages)
if not search_results:
logger.warning("No search results found")
return
# Add search results as attachments
key = f"user_{row_idx}"
if key not in history['metadata']:
history['metadata'][key] = {"timestamp": get_current_timestamp()}
if "attachments" not in history['metadata'][key]:
history['metadata'][key]["attachments"] = []
for result in search_results:
attachment = {
"name": result['title'],
"type": "text/html",
"url": result['url'],
"content": result['content']
}
history['metadata'][key]["attachments"].append(attachment)
logger.info(f"Added {len(search_results)} web search results as attachments")
except Exception as e:
logger.error(f"Error in web search: {e}")


@ -126,7 +126,7 @@ def check_env():
sys.exit(1)
# Ensure this is a new environment and not the base environment
if os.environ["CONDA_DEFAULT_ENV"] == "base":
if os.environ.get("CONDA_DEFAULT_ENV", "") == "base":
print("Create an environment for this project and activate it. Exiting...")
sys.exit(1)
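The switch to os.environ.get above avoids a KeyError when the variable is missing entirely; a two-line illustration:
import os
# os.environ["CONDA_DEFAULT_ENV"] raises KeyError when the variable is unset;
# .get(..., "") simply compares "" == "base" and the check passes quietly.
print(os.environ.get("CONDA_DEFAULT_ENV", "") == "base")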
@ -222,7 +222,7 @@ def update_pytorch_and_python():
if "+cu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
elif "+rocm" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4"
elif "+cpu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
elif "+cxx11" in torver:
@ -273,7 +273,7 @@ def install_webui():
"What is your GPU?",
{
'A': 'NVIDIA - CUDA 12.4',
'B': 'AMD - Linux/macOS only, requires ROCm 6.1',
'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4',
'C': 'Apple M Series',
'D': 'Intel Arc (beta)',
'N': 'CPU mode'
@ -314,7 +314,7 @@ def install_webui():
if selected_gpu == "NVIDIA":
install_pytorch += "--index-url https://download.pytorch.org/whl/cu124"
elif selected_gpu == "AMD":
install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1"
install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4"
elif selected_gpu in ["APPLE", "NONE"]:
install_pytorch += "--index-url https://download.pytorch.org/whl/cpu"
elif selected_gpu == "INTEL":


@ -1,7 +1,9 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
bitsandbytes==0.45.*
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -13,6 +15,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -30,12 +33,12 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"


@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,6 +32,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"


@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,6 +32,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"


@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,7 +32,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl


@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,8 +32,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl


@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,5 +32,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"


@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -29,5 +32,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"


@ -1,7 +1,9 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
bitsandbytes==0.45.*
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -13,6 +15,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -30,12 +33,12 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"


@ -1,6 +1,8 @@
accelerate==1.5.*
beautifulsoup4==4.13.4
colorama
datasets
duckduckgo_search==8.0.2
einops
fastapi==0.112.4
gradio==4.37.*
@ -12,6 +14,7 @@ peft==0.15.*
Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich


@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"


@ -1,18 +0,0 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"


@ -1,18 +0,0 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"


@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"


@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,6 +18,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"


@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"


@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"


@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"


@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich


@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"


@ -1,9 +1,12 @@
beautifulsoup4==4.13.4
duckduckgo_search==8.0.2
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
pyyaml
requests
rich
@ -15,5 +18,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"


@ -51,6 +51,7 @@ from modules.models import load_model, unload_model_if_idle
from modules.models_settings import (
get_fallback_settings,
get_model_metadata,
update_gpu_layers_and_vram,
update_model_parameters
)
from modules.shared import do_cmd_flags_warnings
@ -90,7 +91,7 @@ def create_interface():
'instruction_template_str': shared.settings['instruction_template_str'],
'prompt_menu-default': shared.settings['prompt-default'],
'prompt_menu-notebook': shared.settings['prompt-notebook'],
'filter_by_loader': shared.args.loader or 'All'
'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp'
})
if Path("user_data/cache/pfp_character.png").exists():
@ -127,7 +128,8 @@ def create_interface():
ui_parameters.create_ui(shared.settings['preset']) # Parameters tab
ui_model_menu.create_ui() # Model tab
training.create_ui() # Training tab
if not shared.args.portable:
training.create_ui() # Training tab
ui_session.create_ui() # Session tab
# Generation events
@ -247,6 +249,20 @@ if __name__ == "__main__":
model_settings = get_model_metadata(model_name)
update_model_parameters(model_settings, initial=True) # hijack the command-line arguments
# Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
vram_usage, adjusted_layers = update_gpu_layers_and_vram(
shared.args.loader,
model_name,
model_settings['gpu_layers'],
shared.args.ctx_size,
shared.args.cache_type,
auto_adjust=True,
for_ui=False
)
shared.args.gpu_layers = adjusted_layers
# Load the model
shared.model, shared.tokenizer = load_model(model_name)
if shared.args.lora:
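The new block above only auto-adjusts when --gpu-layers was not passed on the command line; a condensed sketch of the same gating, with stand-in values:
# Stand-in values for illustration; shared.provided_arguments holds the flags
# the user actually supplied on the command line.
provided_arguments = {"model", "ctx_size"}
loader = "llama.cpp"
model_settings = {"gpu_layers": 48}

if "gpu_layers" not in provided_arguments and loader == "llama.cpp" and "gpu_layers" in model_settings:
    print("auto-adjusting gpu_layers, starting from", model_settings["gpu_layers"])
else:
    print("keeping the user-provided --gpu-layers value")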


@ -1,10 +1,15 @@
#!/usr/bin/env bash
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch "$@"
./portable_env/bin/python3 server.py --portable --api --auto-launch "$@"
exit $?
fi
@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
exit
fi
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
export CUDA_PATH="$INSTALL_ENV_DIR"
export CUDA_HOME="$CUDA_PATH"


@ -1,10 +1,15 @@
#!/bin/bash
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@"
./portable_env/bin/python3 server.py --portable --api --auto-launch --api-port 5005 "$@"
exit $?
fi
@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
exit
fi
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
export CUDA_PATH="$INSTALL_ENV_DIR"
export CUDA_HOME="$CUDA_PATH"


@ -1,11 +1,16 @@
@echo off
setlocal enabledelayedexpansion
@rem environment isolation
set PYTHONNOUSERSITE=1
set PYTHONPATH=
set PYTHONHOME=
cd /D "%~dp0"
@rem Portable install case
if exist "portable_env" (
.\portable_env\python.exe server.py --api --auto-launch %*
.\portable_env\python.exe server.py --portable --api --auto-launch %*
exit /b %errorlevel%
)
@ -87,10 +92,6 @@ if not exist "%INSTALL_ENV_DIR%" (
@rem check if conda environment was actually created
if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end )
@rem environment isolation
set PYTHONNOUSERSITE=1
set PYTHONPATH=
set PYTHONHOME=
set "CUDA_PATH=%INSTALL_ENV_DIR%"
set "CUDA_HOME=%CUDA_PATH%"


@ -31,7 +31,6 @@ seed: -1
custom_stopping_strings: ''
custom_token_bans: ''
negative_prompt: ''
autoload_model: false
dark_theme: true
default_extensions: []
instruction_template_str: |-