From 7c29879e795776ceb742a8ddb47fd3843069cf34 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 30 May 2025 11:17:47 -0700 Subject: [PATCH 1/4] Fix 'Start reply with' (closes #7033) --- modules/chat.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/modules/chat.py b/modules/chat.py index e526a9a0..881f7330 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -806,9 +806,12 @@ def remove_last_message(history): return html.unescape(last[0]), history -def send_dummy_message(textbox, state): +def send_dummy_message(text, state): history = state['history'] - text = textbox['text'] + + # Handle both dict and string inputs + if isinstance(text, dict): + text = text['text'] # Initialize metadata if not present if 'metadata' not in history: @@ -822,9 +825,12 @@ def send_dummy_message(textbox, state): return history -def send_dummy_reply(textbox, state): +def send_dummy_reply(text, state): history = state['history'] - text = textbox['text'] + + # Handle both dict and string inputs + if isinstance(text, dict): + text = text['text'] # Initialize metadata if not present if 'metadata' not in history: From 298d4719c6c9545a701a9cc9e8f4efceb108599a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 30 May 2025 11:32:24 -0700 Subject: [PATCH 2/4] Multiple small style improvements --- css/main.css | 4 ++++ modules/ui.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/css/main.css b/css/main.css index 9685c863..967d94ed 100644 --- a/css/main.css +++ b/css/main.css @@ -1551,3 +1551,7 @@ strong { color: var(--body-text-color-subdued); margin-top: 4px; } + +button:focus { + outline: none; +} diff --git a/modules/ui.py b/modules/ui.py index a2662e14..9f4d67cb 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -71,6 +71,7 @@ if not shared.args.old_colors: block_background_fill_dark='transparent', block_border_color_dark='transparent', input_border_color_dark='var(--border-color-dark)', + input_border_color_focus_dark='var(--border-color-dark)', checkbox_border_color_dark='var(--border-color-dark)', border_color_primary_dark='var(--border-color-dark)', button_secondary_border_color_dark='var(--border-color-dark)', @@ -89,6 +90,8 @@ if not shared.args.old_colors: checkbox_label_shadow='none', block_shadow='none', block_shadow_dark='none', + input_shadow_focus='none', + input_shadow_focus_dark='none', button_large_radius='0.375rem', button_large_padding='6px 12px', input_radius='0.375rem', From 219f0a773166deeb0326c2874b29e66e382df524 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 30 May 2025 12:05:49 -0700 Subject: [PATCH 3/4] Fix exllamav3_hf models failing to unload (closes #7031) --- modules/exllamav3_hf.py | 17 +++++++++++++++++ modules/models.py | 3 +++ 2 files changed, 20 insertions(+) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index 417df473..1254ff5d 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -245,3 +245,20 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path) return Exllamav3HF(pretrained_model_name_or_path) + + def unload(self): + """Properly unload the ExllamaV3 model and free GPU memory.""" + if hasattr(self, 'ex_model') and self.ex_model is not None: + self.ex_model.unload() + self.ex_model = None + + if hasattr(self, 'ex_cache') and self.ex_cache is not None: + self.ex_cache = None + + # Clean up any additional ExllamaV3 resources + if hasattr(self, 'past_seq'): + self.past_seq = None + if hasattr(self, 'past_seq_negative'): + self.past_seq_negative = None + if hasattr(self, 'ex_cache_negative'): + self.ex_cache_negative = None diff --git a/modules/models.py b/modules/models.py index 4218d58c..d329ae3c 100644 --- a/modules/models.py +++ b/modules/models.py @@ -116,10 +116,13 @@ def unload_model(keep_model_name=False): return is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer') + if shared.args.loader == 'ExLlamav3_HF': + shared.model.unload() shared.model = shared.tokenizer = None shared.lora_names = [] shared.model_dirty_from_training = False + if not is_llamacpp: from modules.torch_utils import clear_torch_cache clear_torch_cache() From 15f466ca3f8255f2566f016db8d7b8fd9ebef3f4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Fri, 30 May 2025 15:49:57 -0700 Subject: [PATCH 4/4] Update README --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index daf409d0..55df33d2 100644 --- a/README.md +++ b/README.md @@ -14,18 +14,18 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. - Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory. -- **File attachments**: Upload text files and PDF documents directly in conversations to talk about their contents. -- **Web search**: Optionally search the internet with LLM-generated queries based on your input to add context to the conversation. -- Advanced chat management: Edit messages, navigate between message versions, and branch conversations at any point. +- 100% offline and private, with zero telemetry, external resources, or remote update requests. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. -- Automatic GPU layers for GGUF models (on NVIDIA GPUs). -- UI that resembles the original ChatGPT style. -- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. -- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. +- **File attachments**: Upload text files and PDF documents to talk about their contents. +- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation. +- Aesthetic UI with dark and light themes. +- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters. +- Edit messages, navigate between message versions, and branch conversations at any point. - Multiple sampling parameters and generation options for sophisticated text generation control. -- Switch between different models easily in the UI without restarting, with fine control over settings. +- Switch between different models in the UI without restarting. +- Automatic GPU layers for GGUF models (on NVIDIA GPUs). +- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. - OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). -- 100% offline and private, with zero telemetry, external resources, or remote update requests. Web search is optional and user-controlled. - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. ## How to install