diff --git a/README.md b/README.md
index 7105ce23..16b02539 100644
--- a/README.md
+++ b/README.md
@@ -13,15 +13,19 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## Features
- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)).
-- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment.
-- UI that resembles the original ChatGPT style.
-- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats.
-- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`.
-- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these.
-- Multiple sampling parameters and generation options for sophisticated text generation control.
-- Switch between different models easily in the UI without restarting, with fine control over settings.
-- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
+- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
- 100% offline and private, with zero telemetry, external resources, or remote update requests.
+- Automatic prompt formatting using Jinja2 templates, so you never have to worry about prompt formats.
+- **File attachments**: Upload text files, PDF documents, and .docx files to talk about their contents.
+- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
+- Aesthetic UI with dark and light themes.
+- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters.
+- Edit messages, navigate between message versions, and branch conversations at any point.
+- Multiple sampling parameters and generation options for sophisticated text generation control.
+- Switch between different models in the UI without restarting.
+- Automatic GPU layer allocation for GGUF models (on NVIDIA GPUs).
+- Free-form text generation in the Default/Notebook tabs without being limited to chat turns.
+- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples) and the quick sketch after this list.
- Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
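For the OpenAI-compatible API mentioned in the list above, a minimal chat-completion request could look like the sketch below. It assumes the server was started with `--api` and is listening on the default `--api-port` of 5000; adjust the URL otherwise.

```python
# Minimal sketch of a request to the local OpenAI-compatible API.
# Assumes the server was launched with --api and the default --api-port 5000.
import requests

url = "http://127.0.0.1:5000/v1/chat/completions"
payload = {
    "messages": [{"role": "user", "content": "Hello! What can you do?"}],
    "max_tokens": 200,
}

response = requests.post(url, json=payload, timeout=120)
print(response.json()["choices"][0]["message"]["content"])
```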
## How to install
@@ -185,13 +189,13 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--
[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR]
[--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit]
[--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap]
- [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N]
+ [--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N]
[--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT]
- [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner]
- [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE]
- [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH]
- [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api]
- [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
+ [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR]
+ [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT]
+ [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
+ [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY]
+ [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui]
Text generation web UI
@@ -213,7 +217,7 @@ Basic settings:
--idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.
Model loader:
- --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ,
+ --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2,
TensorRT-LLM.
Transformers/Accelerate:
@@ -244,16 +248,18 @@ llama.cpp:
--batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval.
--no-mmap Prevent mmap from being used.
--mlock Force the system to keep the model in RAM.
- --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU.
+ --gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU.
--tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.
--numa Activate NUMA task allocation for llama.cpp.
--no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
--row-split Split the model by rows across GPUs. This may improve multi-gpu performance.
- --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"
+ --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"
--streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
-Context and cache management:
+Context and cache:
--ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens.
+ --cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits
+ separately, e.g. q4_q8).
Speculative decoding:
--model-draft MODEL_DRAFT Path to the draft model for speculative decoding.
@@ -272,15 +278,9 @@ ExLlamaV2:
--num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral.
--enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2.
-HQQ:
- --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.
-
TensorRT-LLM:
--cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet.
-Cache:
- --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.
-
DeepSpeed:
--deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.
--nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading.
@@ -303,6 +303,7 @@ Gradio:
--ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file.
--subpath SUBPATH Customize the subpath for gradio, use with reverse proxy
--old-colors Use the legacy Gradio colors, before the December/2024 update.
+ --portable Hide features that are not available in portable mode, such as training.
API:
--api Enable the API extension.
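A note on the `--extra-flags` format shown above (`"flag1=value1,flag2,flag3=value3"`): a hypothetical helper that expands such a string into llama-server arguments might look like the sketch below. This only illustrates the format; the webui's actual parsing code may differ.

```python
# Hypothetical sketch: expand an --extra-flags string into llama-server arguments.
def expand_extra_flags(extra_flags: str) -> list[str]:
    args = []
    for item in filter(None, extra_flags.split(",")):
        if "=" in item:
            flag, value = item.split("=", 1)  # split once; values may contain '='
            args += [f"--{flag}", value]
        else:
            args.append(f"--{item}")
    return args


print(expand_extra_flags("override-tensor=exps=CPU,no-mmap"))
# ['--override-tensor', 'exps=CPU', '--no-mmap']
```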
diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css
index 6ad250aa..9831ee8f 100644
--- a/css/html_instruct_style.css
+++ b/css/html_instruct_style.css
@@ -17,6 +17,14 @@
color: #d1d5db !important;
}
+.chat .message-body :is(th, td) {
+ border-color: #40404096 !important;
+}
+
+.dark .chat .message-body :is(th, td) {
+ border-color: #ffffff75 !important;
+}
+
.chat .message-body :is(p, ul, ol) {
margin: 1.25em 0 !important;
}
diff --git a/css/main.css b/css/main.css
index 10089b1d..026ea6c8 100644
--- a/css/main.css
+++ b/css/main.css
@@ -1,11 +1,11 @@
:root {
--darker-gray: #202123;
- --dark-gray: #343541;
- --light-gray: #444654;
+ --dark-gray: #2A2B32;
+ --light-gray: #373943;
--light-theme-gray: #f9fbff;
--border-color-dark: #525252;
--header-width: 112px;
- --selected-item-color-dark: #32333e;
+ --selected-item-color-dark: #2E2F38;
}
@font-face {
@@ -265,7 +265,7 @@ button {
.dark .pretty_scrollbar::-webkit-scrollbar-thumb,
.dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover {
- background: rgba(255, 255, 255, 0.2);
+ background: rgb(255 255 255 / 10%);
border-radius: 10px;
}
@@ -582,7 +582,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
#chat-input {
padding: 0;
- padding-top: 18px;
background: transparent;
border: none;
}
@@ -661,37 +660,12 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
}
-#show-controls {
- position: absolute;
- background-color: transparent;
- border: 0 !important;
- border-radius: 0;
-}
-
-#show-controls label {
- z-index: 1000;
- position: absolute;
- right: 30px;
- top: 10px;
- white-space: nowrap;
- overflow: hidden;
- text-overflow: ellipsis;
-}
-
-.dark #show-controls span {
- color: var(--neutral-400);
-}
-
-#show-controls span {
- color: var(--neutral-600);
-}
-
#typing-container {
display: none;
position: absolute;
background-color: transparent;
left: -2px;
- top: 4px;
+ top: -5px;
padding: var(--block-padding);
}
@@ -785,6 +759,33 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
background: var(--selected-item-color-dark) !important;
}
+#show-controls {
+ height: 36px;
+ border-top: 1px solid var(--border-color-dark) !important;
+ border-left: 1px solid var(--border-color-dark) !important;
+ border-right: 1px solid var(--border-color-dark) !important;
+ border-radius: 0;
+ border-bottom: 0 !important;
+ background-color: var(--darker-gray);
+ padding-top: 3px;
+ padding-left: 4px;
+ display: flex;
+}
+
+#show-controls label {
+ display: flex;
+ flex-direction: row-reverse;
+ font-weight: bold;
+ justify-content: start;
+ width: 100%;
+ padding-right: 12px;
+ gap: 10px;
+}
+
+#show-controls label input {
+ margin-top: 4px;
+}
+
.transparent-substring {
opacity: 0.333;
}
@@ -1326,6 +1327,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
overflow: hidden;
}
+.thinking-content:focus, .thinking-header:focus {
+ outline: 0 !important;
+}
+
.dark .thinking-block {
background-color: var(--darker-gray);
}
@@ -1462,6 +1467,7 @@ strong {
.editing-textarea {
width: 100%;
min-height: 200px;
+ max-height: 65vh;
padding: 10px;
border-radius: 5px;
border: 1px solid #ccc;
@@ -1563,3 +1569,12 @@ strong {
border: 2px solid var(--border-color-primary);
aspect-ratio: 1 / 1;
}
+
+button:focus {
+ outline: none;
+}
+
+/* Fix extra gaps for hidden elements on the right sidebar */
+.svelte-sa48pu.stretch:has(> .hidden:only-child) {
+ display: none;
+}
diff --git a/download-model.py b/download-model.py
index 25517491..576a8b79 100644
--- a/download-model.py
+++ b/download-model.py
@@ -32,6 +32,7 @@ class ModelDownloader:
self.max_retries = max_retries
self.session = self.get_session()
self._progress_bar_slots = None
+ self.progress_queue = None
def get_session(self):
session = requests.Session()
@@ -218,33 +219,45 @@ class ModelDownloader:
max_retries = self.max_retries
attempt = 0
+ file_downloaded_count_for_progress = 0
+
try:
while attempt < max_retries:
attempt += 1
session = self.session
headers = {}
mode = 'wb'
+ current_file_size_on_disk = 0
try:
if output_path.exists() and not start_from_scratch:
- # Resume download
- r = session.get(url, stream=True, timeout=20)
- total_size = int(r.headers.get('content-length', 0))
- if output_path.stat().st_size >= total_size:
+ current_file_size_on_disk = output_path.stat().st_size
+ r_head = session.head(url, timeout=20)
+ r_head.raise_for_status()
+ total_size = int(r_head.headers.get('content-length', 0))
+
+ if current_file_size_on_disk >= total_size and total_size > 0:
+ if self.progress_queue is not None and total_size > 0:
+ self.progress_queue.put((1.0, str(filename)))
return
- headers = {'Range': f'bytes={output_path.stat().st_size}-'}
+ headers = {'Range': f'bytes={current_file_size_on_disk}-'}
mode = 'ab'
with session.get(url, stream=True, headers=headers, timeout=30) as r:
- r.raise_for_status() # If status is not 2xx, raise an error
- total_size = int(r.headers.get('content-length', 0))
- block_size = 1024 * 1024 # 1MB
+ r.raise_for_status()
+ total_size_from_stream = int(r.headers.get('content-length', 0))
+ if mode == 'ab':
+ effective_total_size = current_file_size_on_disk + total_size_from_stream
+ else:
+ effective_total_size = total_size_from_stream
- filename_str = str(filename) # Convert PosixPath to string if necessary
+ block_size = 1024 * 1024
+ filename_str = str(filename)
tqdm_kwargs = {
- 'total': total_size,
+ 'total': effective_total_size,
+ 'initial': current_file_size_on_disk if mode == 'ab' else 0,
'unit': 'B',
'unit_scale': True,
'unit_divisor': 1024,
@@ -261,16 +274,20 @@ class ModelDownloader:
})
with open(output_path, mode) as f:
+ if mode == 'ab':
+ f.seek(current_file_size_on_disk)
+
with tqdm.tqdm(**tqdm_kwargs) as t:
- count = 0
+ file_downloaded_count_for_progress = current_file_size_on_disk
for data in r.iter_content(block_size):
f.write(data)
t.update(len(data))
- if total_size != 0 and self.progress_bar is not None:
- count += len(data)
- self.progress_bar(float(count) / float(total_size), f"{filename_str}")
+ if effective_total_size != 0 and self.progress_queue is not None:
+ file_downloaded_count_for_progress += len(data)
+ progress_fraction = float(file_downloaded_count_for_progress) / float(effective_total_size)
+ self.progress_queue.put((progress_fraction, filename_str))
+ break
- break # Exit loop if successful
except (RequestException, ConnectionError, Timeout) as e:
print(f"Error downloading {filename}: {e}.")
print(f"That was attempt {attempt}/{max_retries}.", end=' ')
@@ -295,10 +312,9 @@ class ModelDownloader:
finally:
print(f"\nDownload of {len(file_list)} files to {output_folder} completed.")
- def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar=None, start_from_scratch=False, threads=4, specific_file=None, is_llamacpp=False):
- self.progress_bar = progress_bar
+ def download_model_files(self, model, branch, links, sha256, output_folder, progress_queue=None, start_from_scratch=False, threads=4, specific_file=None, is_llamacpp=False):
+ self.progress_queue = progress_queue
- # Create the folder and writing the metadata
output_folder.mkdir(parents=True, exist_ok=True)
if not is_llamacpp:
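The resume logic above follows a common pattern: probe the remote size with a HEAD request, continue the partial file with a Range header, and report progress fractions through a queue. A self-contained sketch of that pattern, with hypothetical names rather than the project's own code:

```python
# Standalone sketch: HEAD for the total size, Range header to resume a partial
# file, and progress fractions pushed to a queue for a consumer (e.g. a UI).
import queue
from pathlib import Path

import requests


def resume_download(url: str, output_path: Path, progress_queue: queue.Queue) -> None:
    session = requests.Session()
    done = output_path.stat().st_size if output_path.exists() else 0

    head = session.head(url, timeout=20, allow_redirects=True)
    head.raise_for_status()
    total = int(head.headers.get("content-length", 0))
    if total and done >= total:
        progress_queue.put((1.0, output_path.name))  # nothing left to download
        return

    headers = {"Range": f"bytes={done}-"} if done else {}
    with session.get(url, stream=True, headers=headers, timeout=30) as r:
        r.raise_for_status()
        with open(output_path, "ab" if done else "wb") as f:
            for chunk in r.iter_content(1024 * 1024):  # 1 MB blocks
                f.write(chunk)
                done += len(chunk)
                if total:
                    progress_queue.put((done / total, output_path.name))
```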
diff --git a/js/global_scope_js.js b/js/global_scope_js.js
index 0e86d450..801f1574 100644
--- a/js/global_scope_js.js
+++ b/js/global_scope_js.js
@@ -95,6 +95,12 @@ function startEditing(messageElement, messageBody, isUserMessage) {
editingInterface.textarea.focus();
editingInterface.textarea.setSelectionRange(rawText.length, rawText.length);
+ // Scroll the textarea into view
+ editingInterface.textarea.scrollIntoView({
+ behavior: "smooth",
+ block: "center"
+ });
+
// Setup event handlers
setupEditingHandlers(editingInterface.textarea, messageElement, originalHTML, messageBody, isUserMessage);
}
@@ -186,31 +192,33 @@ function navigateVersion(element, direction) {
const index = messageElement.getAttribute("data-index");
if (!index) return;
- const indexInput = document.getElementById("Navigate-message-index").querySelector("input");
- if (!indexInput) {
- console.error("Element with ID 'Navigate-message-index' not found.");
- return;
- }
-
- const directionInput = document.getElementById("Navigate-direction").querySelector("textarea");
- if (!directionInput) {
- console.error("Element with ID 'Navigate-direction' not found.");
- return;
+ // Determine role based on message element classes
+ let role = "assistant"; // Default role
+ if (messageElement.classList.contains("user-message") ||
+ messageElement.querySelector(".text-you") ||
+ messageElement.querySelector(".circle-you")) {
+ role = "user";
}
+ const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input");
+ const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea");
+ const roleInput = document.getElementById("Navigate-message-role")?.querySelector("textarea");
const navigateButton = document.getElementById("Navigate-version");
- if (!navigateButton) {
- console.error("Required element 'Navigate-version' not found.");
+
+ if (!indexInput || !directionInput || !roleInput || !navigateButton) {
+ console.error("Navigation control elements (index, direction, role, or button) not found.");
return;
}
indexInput.value = index;
directionInput.value = direction;
+ roleInput.value = role;
- // Trigger any 'change' or 'input' events Gradio might be listening for
+ // Trigger 'input' events for Gradio to pick up changes
const event = new Event("input", { bubbles: true });
indexInput.dispatchEvent(event);
directionInput.dispatchEvent(event);
+ roleInput.dispatchEvent(event);
navigateButton.click();
}
@@ -227,10 +235,23 @@ function removeLastClick() {
document.getElementById("Remove-last").click();
}
-function handleMorphdomUpdate(text) {
+function handleMorphdomUpdate(data) {
+ // Determine target element and use it as query scope
+ var target_element, target_html;
+ if (data.last_message_only) {
+ const childNodes = document.getElementsByClassName("messages")[0].childNodes;
+ target_element = childNodes[childNodes.length - 1];
+ target_html = data.html;
+ } else {
+ target_element = document.getElementById("chat").parentNode;
+ target_html = "
'
- f'{copy_button}'
- f'{edit_button}'
- f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
- f'{continue_button if i == len(history["visible"]) - 1 else ""}'
- f'{remove_button if i == len(history["visible"]) - 1 else ""}'
- f'{branch_button}'
- f'{info_message}'
- f'
'
- f'{get_version_navigation_html(history, i)}')
- return (f''
+def generate_chat_html(history, name1, name2, reset_cache=False, last_message_only=False):
+ if not last_message_only:
+ output = f'
'
+ else:
+ output = ""
- for i in range(len(history['visible'])):
- row_visible = history['visible'][i]
- row_internal = history['internal'][i]
- converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
+ def create_message(role, content, raw_content):
+ """Inner function for WPP-style messages."""
+ text_class = "text-you" if role == "user" else "text-bot"
- # Get timestamps
- user_timestamp = format_message_timestamp(history, "user", i)
- assistant_timestamp = format_message_timestamp(history, "assistant", i)
+ # Get role-specific data
+ timestamp = format_message_timestamp(history, role, i)
+ attachments = format_message_attachments(history, role, i)
- # Get attachments
- user_attachments = format_message_attachments(history, "user", i)
- assistant_attachments = format_message_attachments(history, "assistant", i)
+ # Create info button if timestamp exists
+ info_message = ""
+ if timestamp:
+ tooltip_text = get_message_tooltip(history, role, i)
+ info_message = info_button.replace('title="message"', f'title="{html.escape(tooltip_text)}"')
- # Create info buttons for timestamps if they exist
- info_message_user = ""
- if user_timestamp != "":
- # Extract the timestamp value from the span
- user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0]
- info_message_user = info_button.replace("message", user_timestamp_value)
-
- info_message_assistant = ""
- if assistant_timestamp != "":
- # Extract the timestamp value from the span
- assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0]
- info_message_assistant = info_button.replace("message", assistant_timestamp_value)
-
- if converted_visible[0]: # Don't display empty user messages
- output += (
- f'
'
- f'
'
- f'
{converted_visible[0]}
'
- f'{user_attachments}'
- f'{actions_html(history, i, "user", info_message_user)}'
- f'
'
- f'
'
- )
-
- output += (
+ return (
f'
'
- f'
'
- f'
{converted_visible[1]}
'
- f'{assistant_attachments}'
- f'{actions_html(history, i, "assistant", info_message_assistant)}'
+ f'
'
+ f'
{content}
'
+ f'{attachments}'
+ f'{actions_html(history, i, role, info_message)}'
f'
'
f'
'
)
- output += "
"
+ # Determine range
+ start_idx = len(history['visible']) - 1 if last_message_only else 0
+ end_idx = len(history['visible'])
+
+ for i in range(start_idx, end_idx):
+ row_visible = history['visible'][i]
+ row_internal = history['internal'][i]
+
+ # Convert content
+ if last_message_only:
+ converted_visible = [None, convert_to_markdown_wrapped(row_visible[1], message_id=i, use_cache=i != len(history['visible']) - 1)]
+ else:
+ converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible]
+
+ # Generate messages
+ if not last_message_only and converted_visible[0]:
+ output += create_message("user", converted_visible[0], row_internal[0])
+
+ output += create_message("assistant", converted_visible[1], row_internal[1])
+
+ if not last_message_only:
+ output += "
"
+
return output
@@ -628,15 +667,15 @@ def time_greeting():
return "Good evening!"
-def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False):
+def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False, last_message_only=False):
if len(history['visible']) == 0:
greeting = f"
{time_greeting()} How can I help you today?
"
result = f'
{greeting}
'
elif mode == 'instruct':
- result = generate_instruct_html(history)
+ result = generate_instruct_html(history, last_message_only=last_message_only)
elif style == 'wpp':
- result = generate_chat_html(history, name1, name2)
+ result = generate_chat_html(history, name1, name2, last_message_only=last_message_only)
else:
- result = generate_cai_chat_html(history, name1, name2, style, character, reset_cache)
+ result = generate_cai_chat_html(history, name1, name2, style, character, reset_cache=reset_cache, last_message_only=last_message_only)
- return {'html': result}
+ return {'html': result, 'last_message_only': last_message_only}
diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py
index ee63262e..8b4ed7a7 100644
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -421,14 +421,31 @@ class LlamaServer:
def filter_stderr_with_progress(process_stderr):
progress_pattern = re.compile(r'slot update_slots: id.*progress = (\d+\.\d+)')
+ last_was_progress = False
+
try:
for line in iter(process_stderr.readline, ''):
+ line = line.rstrip('\n\r') # Remove existing newlines
progress_match = progress_pattern.search(line)
+
if progress_match:
- sys.stderr.write(line)
+ if last_was_progress:
+ # Overwrite the previous progress line using carriage return
+ sys.stderr.write(f'\r{line}')
+ else:
+ # First progress line - print normally
+ sys.stderr.write(line)
sys.stderr.flush()
+ last_was_progress = True
elif not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line:
- sys.stderr.write(line)
+ if last_was_progress:
+ # Finish the progress line with a newline, then print the new line
+ sys.stderr.write(f'\n{line}\n')
+ else:
+ # Normal line - print with newline
+ sys.stderr.write(f'{line}\n')
sys.stderr.flush()
+ last_was_progress = False
+ # For filtered lines, don't change last_was_progress state
except (ValueError, IOError):
pass
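The stderr filtering above relies on a small terminal trick: consecutive progress lines are rewritten in place with a carriage return, and the next normal line first terminates the progress line with a newline. A tiny illustration:

```python
# In-place progress updates on stderr using carriage returns.
import sys
import time

for step in range(0, 101, 10):
    sys.stderr.write(f"\rprogress = {step / 100:.2f}")
    sys.stderr.flush()
    time.sleep(0.05)

sys.stderr.write("\n")  # finish the progress line before printing anything else
sys.stderr.write("done\n")
```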
diff --git a/modules/models.py b/modules/models.py
index 4218d58c..c1e7fb56 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -116,10 +116,13 @@ def unload_model(keep_model_name=False):
return
is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
+ if shared.model.__class__.__name__ == 'Exllamav3HF':
+ shared.model.unload()
shared.model = shared.tokenizer = None
shared.lora_names = []
shared.model_dirty_from_training = False
+
if not is_llamacpp:
from modules.torch_utils import clear_torch_cache
clear_torch_cache()
diff --git a/modules/shared.py b/modules/shared.py
index d2305f30..f712f7f8 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -21,7 +21,7 @@ lora_names = []
# Generation variables
stop_everything = False
generation_lock = None
-processing_message = '*Is typing...*'
+processing_message = ''
# UI variables
gradio = {}
@@ -47,7 +47,6 @@ settings = {
'max_new_tokens_max': 4096,
'prompt_lookup_num_tokens': 0,
'max_tokens_second': 0,
- 'max_updates_second': 12,
'auto_max_new_tokens': True,
'ban_eos_token': False,
'add_bos_token': True,
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 962311df..0d499d50 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -65,41 +65,39 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
all_stop_strings += st
shared.stop_everything = False
- last_update = -1
reply = ''
is_stream = state['stream']
if len(all_stop_strings) > 0 and not state['stream']:
state = copy.deepcopy(state)
state['stream'] = True
- min_update_interval = 0
- if state.get('max_updates_second', 0) > 0:
- min_update_interval = 1 / state['max_updates_second']
-
# Generate
+ last_update = -1
+ latency_threshold = 1 / 1000
for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat):
+ cur_time = time.monotonic()
reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
if escape_html:
reply = html.escape(reply)
if is_stream:
- cur_time = time.time()
-
# Limit number of tokens/second to make text readable in real time
if state['max_tokens_second'] > 0:
diff = 1 / state['max_tokens_second'] - (cur_time - last_update)
if diff > 0:
time.sleep(diff)
- last_update = time.time()
+ last_update = time.monotonic()
yield reply
# Limit updates to avoid lag in the Gradio UI
# API updates are not limited
else:
- if cur_time - last_update > min_update_interval:
- last_update = cur_time
+ # If 'generate_func' takes less than 0.001 seconds to yield the next token
+ # (equivalent to more than 1000 tok/s), assume that the UI is lagging behind and skip yielding
+ if (cur_time - last_update) > latency_threshold:
yield reply
+ last_update = time.monotonic()
if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything):
break
@@ -505,11 +503,11 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
return
-def print_prompt(prompt, max_chars=2000):
+def print_prompt(prompt, max_chars=-1):
DARK_YELLOW = "\033[38;5;3m"
RESET = "\033[0m"
- if len(prompt) > max_chars:
+ if max_chars > 0 and len(prompt) > max_chars:
half_chars = max_chars // 2
hidden_len = len(prompt[half_chars:-half_chars])
hidden_msg = f"{DARK_YELLOW}[...{hidden_len} characters hidden...]{RESET}"
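The streaming change above replaces the fixed `max_updates_second` cap with a latency heuristic: if the generator yields again within 1 ms (more than 1000 tok/s), the update is skipped on the assumption that the UI is lagging behind. A distilled sketch of that rule, with hypothetical names:

```python
# Distilled sketch of the latency-based throttle: forward an update only when
# more than ~1 ms has passed since the last forwarded update.
import time


def throttle_updates(token_stream, latency_threshold=1 / 1000):
    last_update = -1
    reply = ""
    for token in token_stream:
        reply += token
        cur_time = time.monotonic()
        if (cur_time - last_update) > latency_threshold:
            yield reply
            last_update = time.monotonic()
    yield reply  # always emit the final state
```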
diff --git a/modules/ui.py b/modules/ui.py
index e24e6402..14a09d2b 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -6,6 +6,7 @@ import yaml
import extensions
from modules import shared
+from modules.chat import load_history
with open(Path(__file__).resolve().parent / '../css/NotoSans/stylesheet.css', 'r') as f:
css = f.read()
@@ -71,6 +72,7 @@ if not shared.args.old_colors:
block_background_fill_dark='transparent',
block_border_color_dark='transparent',
input_border_color_dark='var(--border-color-dark)',
+ input_border_color_focus_dark='var(--border-color-dark)',
checkbox_border_color_dark='var(--border-color-dark)',
border_color_primary_dark='var(--border-color-dark)',
button_secondary_border_color_dark='var(--border-color-dark)',
@@ -89,6 +91,8 @@ if not shared.args.old_colors:
checkbox_label_shadow='none',
block_shadow='none',
block_shadow_dark='none',
+ input_shadow_focus='none',
+ input_shadow_focus_dark='none',
button_large_radius='0.375rem',
button_large_padding='6px 12px',
input_radius='0.375rem',
@@ -191,7 +195,6 @@ def list_interface_input_elements():
'max_new_tokens',
'prompt_lookup_num_tokens',
'max_tokens_second',
- 'max_updates_second',
'do_sample',
'dynamic_temperature',
'temperature_last',
@@ -212,14 +215,13 @@ def list_interface_input_elements():
'grammar_string',
'navigate_message_index',
'navigate_direction',
+ 'navigate_message_role',
'edit_message_index',
'edit_message_text',
'edit_message_role',
'branch_index',
'enable_web_search',
'web_search_pages',
- 'navigate_message_index',
- 'navigate_direction',
]
# Chat elements
@@ -268,6 +270,10 @@ def gather_interface_values(*args):
if not shared.args.multi_user:
shared.persistent_interface_state = output
+ # Prevent history loss if backend is restarted but UI is not refreshed
+ if output['history'] is None and output['unique_id'] is not None:
+ output['history'] = load_history(output['unique_id'], output['character_menu'], output['mode'])
+
return output
diff --git a/modules/ui_chat.py b/modules/ui_chat.py
index 719af85a..0d5a2c18 100644
--- a/modules/ui_chat.py
+++ b/modules/ui_chat.py
@@ -18,7 +18,7 @@ def create_ui():
mu = shared.args.multi_user
shared.gradio['Chat input'] = gr.State()
- shared.gradio['history'] = gr.JSON(visible=False)
+ shared.gradio['history'] = gr.State({'internal': [], 'visible': [], 'metadata': {}})
with gr.Tab('Chat', id='Chat', elem_id='chat-tab'):
with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']):
@@ -55,7 +55,6 @@ def create_ui():
with gr.Column(scale=10, elem_id='chat-input-container'):
shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar'])
- shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls')
shared.gradio['typing-dots'] = gr.HTML(value='
', label='typing', elem_id='typing-container')
with gr.Column(scale=1, elem_id='generate-stop-container'):
@@ -65,21 +64,15 @@ def create_ui():
# Hover menu buttons
with gr.Column(elem_id='chat-buttons'):
- with gr.Row():
- shared.gradio['Regenerate'] = gr.Button('Regenerate (Ctrl + Enter)', elem_id='Regenerate')
- shared.gradio['Continue'] = gr.Button('Continue (Alt + Enter)', elem_id='Continue')
- shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last')
-
- with gr.Row():
- shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate')
-
- with gr.Row():
- shared.gradio['Send dummy message'] = gr.Button('Send dummy message')
- shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply')
-
- with gr.Row():
- shared.gradio['send-chat-to-default'] = gr.Button('Send to Default')
- shared.gradio['send-chat-to-notebook'] = gr.Button('Send to Notebook')
+ shared.gradio['Regenerate'] = gr.Button('Regenerate (Ctrl + Enter)', elem_id='Regenerate')
+ shared.gradio['Continue'] = gr.Button('Continue (Alt + Enter)', elem_id='Continue')
+ shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last')
+ shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate')
+ shared.gradio['Send dummy message'] = gr.Button('Send dummy message')
+ shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply')
+ shared.gradio['send-chat-to-default'] = gr.Button('Send to Default')
+ shared.gradio['send-chat-to-notebook'] = gr.Button('Send to Notebook')
+ shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls')
with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']):
with gr.Column():
@@ -87,7 +80,7 @@ def create_ui():
shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])
with gr.Row():
- shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search')
+ shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search', elem_id='web-search')
with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']:
shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10)
@@ -110,6 +103,7 @@ def create_ui():
with gr.Row(visible=False):
shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index")
shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction")
+ shared.gradio['navigate_message_role'] = gr.Textbox(value="", elem_id="Navigate-message-role")
shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version")
shared.gradio['edit_message_index'] = gr.Number(value=-1, precision=0, elem_id="Edit-message-index")
shared.gradio['edit_message_text'] = gr.Textbox(value="", elem_id="Edit-message-text")
@@ -201,7 +195,7 @@ def create_event_handlers():
shared.reload_inputs = gradio(reload_arr)
# Morph HTML updates instead of updating everything
- shared.gradio['display'].change(None, gradio('display'), None, js="(data) => handleMorphdomUpdate(data.html)")
+ shared.gradio['display'].change(None, gradio('display'), None, js="(data) => handleMorphdomUpdate(data)")
shared.gradio['Generate'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
@@ -313,8 +307,7 @@ def create_event_handlers():
shared.gradio['edit_message'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
- chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False).then(
- lambda: None, None, None, js='() => { const role = document.getElementById("Edit-message-role").querySelector("textarea").value; if (role === "user") document.getElementById("Regenerate").click(); }')
+ chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False)
# Save/delete a character
shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False)
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 862b3893..2a7d3d9d 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -1,4 +1,6 @@
import importlib
+import queue
+import threading
import traceback
from functools import partial
from pathlib import Path
@@ -205,48 +207,51 @@ def load_lora_wrapper(selected_loras):
def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
+ downloader_module = importlib.import_module("download-model")
+ downloader = downloader_module.ModelDownloader()
+ update_queue = queue.Queue()
+
try:
# Handle direct GGUF URLs
if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")):
try:
path = repo_id.split("huggingface.co/")[1]
-
- # Extract the repository ID (first two parts of the path)
parts = path.split("/")
if len(parts) >= 2:
extracted_repo_id = f"{parts[0]}/{parts[1]}"
-
- # Extract the filename (last part of the path)
- filename = repo_id.split("/")[-1]
- if "?download=true" in filename:
- filename = filename.replace("?download=true", "")
-
+ filename = repo_id.split("/")[-1].replace("?download=true", "")
repo_id = extracted_repo_id
specific_file = filename
- except:
- pass
+ except Exception as e:
+ yield f"Error parsing GGUF URL: {e}"
+ progress(0.0)
+ return
- if repo_id == "":
- yield ("Please enter a model path")
+ if not repo_id:
+ yield "Please enter a model path."
+ progress(0.0)
return
repo_id = repo_id.strip()
specific_file = specific_file.strip()
- downloader = importlib.import_module("download-model").ModelDownloader()
- progress(0.0)
+ progress(0.0, "Preparing download...")
+
model, branch = downloader.sanitize_model_and_branch_names(repo_id, None)
-
- yield ("Getting the download links from Hugging Face")
+ yield "Getting download links from Hugging Face..."
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file)
+ if not links:
+ yield "No files found to download for the given model/criteria."
+ progress(0.0)
+ return
+
# Check for multiple GGUF files
gguf_files = [link for link in links if link.lower().endswith('.gguf')]
if len(gguf_files) > 1 and not specific_file:
output = "Multiple GGUF files found. Please copy one of the following filenames to the 'File name' field:\n\n```\n"
for link in gguf_files:
output += f"{Path(link).name}\n"
-
output += "```"
yield output
return
@@ -255,17 +260,13 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
output = "```\n"
for link in links:
output += f"{Path(link).name}" + "\n"
-
output += "```"
yield output
return
- yield ("Getting the output folder")
+ yield "Determining output folder..."
output_folder = downloader.get_output_folder(
- model,
- branch,
- is_lora,
- is_llamacpp=is_llamacpp,
+ model, branch, is_lora, is_llamacpp=is_llamacpp,
model_dir=shared.args.model_dir if shared.args.model_dir != shared.args_defaults.model_dir else None
)
@@ -275,19 +276,65 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
output_folder = Path(shared.args.lora_dir)
if check:
- progress(0.5)
-
- yield ("Checking previously downloaded files")
+ yield "Checking previously downloaded files..."
+ progress(0.5, "Verifying files...")
downloader.check_model_files(model, branch, links, sha256, output_folder)
- progress(1.0)
- else:
- yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}/`")
- downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=is_llamacpp)
+ progress(1.0, "Verification complete.")
+ yield "File check complete."
+ return
- yield (f"Model successfully saved to `{output_folder}/`.")
- except:
- progress(1.0)
- yield traceback.format_exc().replace('\n', '\n\n')
+ yield ""
+ progress(0.0, "Download starting...")
+
+ def downloader_thread_target():
+ try:
+ downloader.download_model_files(
+ model, branch, links, sha256, output_folder,
+ progress_queue=update_queue,
+ threads=4,
+ is_llamacpp=is_llamacpp,
+ specific_file=specific_file
+ )
+ update_queue.put(("COMPLETED", f"Model successfully saved to `{output_folder}/`."))
+ except Exception as e:
+ tb_str = traceback.format_exc().replace('\n', '\n\n')
+ update_queue.put(("ERROR", tb_str))
+
+ download_thread = threading.Thread(target=downloader_thread_target)
+ download_thread.start()
+
+ while True:
+ try:
+ message = update_queue.get(timeout=0.2)
+ if not isinstance(message, tuple) or len(message) != 2:
+ continue
+
+ msg_identifier, data = message
+
+ if msg_identifier == "COMPLETED":
+ progress(1.0, "Download complete!")
+ yield data
+ break
+ elif msg_identifier == "ERROR":
+ progress(0.0, "Error occurred")
+ yield data
+ break
+ elif isinstance(msg_identifier, float):
+ progress_value = msg_identifier
+ description_str = data
+ progress(progress_value, f"Downloading: {description_str}")
+
+ except queue.Empty:
+ if not download_thread.is_alive():
+ yield "Download process finished."
+ break
+
+ download_thread.join()
+
+ except Exception as e:
+ progress(0.0)
+ tb_str = traceback.format_exc().replace('\n', '\n\n')
+ yield tb_str
def update_truncation_length(current_length, state):
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index 733d0901..84f9fbfc 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -71,8 +71,6 @@ def create_ui(default_preset):
shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.')
shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.')
shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.')
- shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.')
-
with gr.Column():
with gr.Row():
with gr.Column():
diff --git a/modules/utils.py b/modules/utils.py
index 0e8bdd18..577c55b8 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -74,7 +74,7 @@ def natural_keys(text):
def check_model_loaded():
if shared.model_name == 'None' or shared.model is None:
- if len(get_available_models()) <= 1:
+ if len(get_available_models()) == 0:
error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it"
logger.error(error_msg)
return False, error_msg
diff --git a/modules/web_search.py b/modules/web_search.py
index d3387ac9..1f670349 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -1,3 +1,5 @@
+import concurrent.futures
+from concurrent.futures import as_completed
from datetime import datetime
import requests
@@ -5,7 +7,6 @@ from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from modules.logging_colors import logger
-from modules.text_generation import generate_reply
def get_current_timestamp():
@@ -13,23 +14,7 @@ def get_current_timestamp():
return datetime.now().strftime('%b %d, %Y %H:%M')
-def generate_search_query(user_message, state):
- """Generate a search query from user message using the LLM"""
- search_prompt = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else."
-
- # Use a minimal state for search query generation
- search_state = state.copy()
- search_state['max_new_tokens'] = 64
- search_state['temperature'] = 0.1
-
- query = ""
- for reply in generate_reply(search_prompt, search_state, stopping_strings=[], is_chat=False):
- query = reply.strip()
-
- return query
-
-
-def download_web_page(url, timeout=10):
+def download_web_page(url, timeout=5):
"""Download and extract text from a web page"""
try:
headers = {
@@ -56,45 +41,63 @@ def download_web_page(url, timeout=10):
return f"[Error downloading content from {url}: {str(e)}]"
-def perform_web_search(query, num_pages=3):
+def perform_web_search(query, num_pages=3, max_workers=5):
"""Perform web search and return results with content"""
try:
with DDGS() as ddgs:
results = list(ddgs.text(query, max_results=num_pages))
- search_results = []
+ # Prepare download tasks
+ download_tasks = []
for i, result in enumerate(results):
url = result.get('href', '')
title = result.get('title', f'Search Result {i+1}')
+ download_tasks.append((url, title, i))
- # Download page content
- content = download_web_page(url)
+ search_results = [None] * len(download_tasks) # Pre-allocate to maintain order
- search_results.append({
- 'title': title,
- 'url': url,
- 'content': content
- })
+ # Download pages in parallel
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+ # Submit all download tasks
+ future_to_task = {
+ executor.submit(download_web_page, task[0]): task
+ for task in download_tasks
+ }
+
+ # Collect results as they complete
+ for future in as_completed(future_to_task):
+ url, title, index = future_to_task[future]
+ try:
+ content = future.result()
+ search_results[index] = {
+ 'title': title,
+ 'url': url,
+ 'content': content
+ }
+ except Exception as e:
+ logger.error(f"Error downloading {url}: {e}")
+ # Include failed downloads with empty content
+ search_results[index] = {
+ 'title': title,
+ 'url': url,
+ 'content': ''
+ }
return search_results
+
except Exception as e:
logger.error(f"Error performing web search: {e}")
return []
-def add_web_search_attachments(history, row_idx, user_message, state):
+def add_web_search_attachments(history, row_idx, user_message, search_query, state):
"""Perform web search and add results as attachments"""
- if not state.get('enable_web_search', False):
+ if not search_query:
+ logger.warning("No search query provided")
return
try:
- # Generate search query
- search_query = generate_search_query(user_message, state)
- if not search_query:
- logger.warning("Failed to generate search query")
- return
-
- logger.info(f"Generated search query: {search_query}")
+ logger.info(f"Using search query: {search_query}")
# Perform web search
num_pages = int(state.get('web_search_pages', 3))
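With `generate_search_query` removed, the caller now supplies the query (for example, one generated by the LLM elsewhere in the chat pipeline) and passes it straight to the search helpers. A short usage sketch, assuming it runs inside the repository so that `modules` is importable:

```python
# Usage sketch for the functions above; the query is produced by the caller.
from modules.web_search import perform_web_search

results = perform_web_search("latest llama.cpp release notes", num_pages=3)
for page in results:
    print(page["title"], page["url"], len(page["content"]))
```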
diff --git a/one_click.py b/one_click.py
index 482a6aa9..cccb0dc9 100644
--- a/one_click.py
+++ b/one_click.py
@@ -70,12 +70,8 @@ def is_installed():
def cpu_has_avx2():
try:
import cpuinfo
-
info = cpuinfo.get_cpu_info()
- if 'avx2' in info['flags']:
- return True
- else:
- return False
+ return 'avx2' in info['flags']
except:
return True
@@ -83,30 +79,112 @@ def cpu_has_avx2():
def cpu_has_amx():
try:
import cpuinfo
-
info = cpuinfo.get_cpu_info()
- if 'amx' in info['flags']:
- return True
- else:
- return False
+ return 'amx' in info['flags']
except:
return True
-def torch_version():
- site_packages_path = None
- for sitedir in site.getsitepackages():
- if "site-packages" in sitedir and conda_env_path in sitedir:
- site_packages_path = sitedir
- break
+def load_state():
+ """Load installer state from JSON file"""
+ if os.path.exists(state_file):
+ try:
+ with open(state_file, 'r') as f:
+ return json.load(f)
+ except:
+ return {}
+ return {}
- if site_packages_path:
- torch_version_file = open(os.path.join(site_packages_path, 'torch', 'version.py')).read().splitlines()
- torver = [line for line in torch_version_file if line.startswith('__version__')][0].split('__version__ = ')[1].strip("'")
+
+def save_state(state):
+ """Save installer state to JSON file"""
+ with open(state_file, 'w') as f:
+ json.dump(state, f)
+
+
+def get_gpu_choice():
+ """Get GPU choice from state file or ask user"""
+ state = load_state()
+ gpu_choice = state.get('gpu_choice')
+
+ if not gpu_choice:
+ if "GPU_CHOICE" in os.environ:
+ choice = os.environ["GPU_CHOICE"].upper()
+ print_big_message(f"Selected GPU choice \"{choice}\" based on the GPU_CHOICE environment variable.")
+ else:
+ choice = get_user_choice(
+ "What is your GPU?",
+ {
+ 'A': 'NVIDIA - CUDA 12.4',
+ 'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4',
+ 'C': 'Apple M Series',
+ 'D': 'Intel Arc (beta)',
+ 'N': 'CPU mode'
+ },
+ )
+
+ # Convert choice to GPU name
+ gpu_choice = {"A": "NVIDIA", "B": "AMD", "C": "APPLE", "D": "INTEL", "N": "NONE"}[choice]
+
+ # Save choice to state
+ state['gpu_choice'] = gpu_choice
+ save_state(state)
+
+ return gpu_choice
+
+
+def get_pytorch_install_command(gpu_choice):
+ """Get PyTorch installation command based on GPU choice"""
+ base_cmd = f"python -m pip install torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION} "
+
+ if gpu_choice == "NVIDIA":
+ return base_cmd + "--index-url https://download.pytorch.org/whl/cu124"
+ elif gpu_choice == "AMD":
+ return base_cmd + "--index-url https://download.pytorch.org/whl/rocm6.2.4"
+ elif gpu_choice in ["APPLE", "NONE"]:
+ return base_cmd + "--index-url https://download.pytorch.org/whl/cpu"
+ elif gpu_choice == "INTEL":
+ if is_linux():
+ return "python -m pip install torch==2.1.0a0 torchvision==0.16.0a0 torchaudio==2.1.0a0 intel-extension-for-pytorch==2.1.10+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
+ else:
+ return "python -m pip install torch==2.1.0a0 torchvision==0.16.0a0 torchaudio==2.1.0a0 intel-extension-for-pytorch==2.1.10 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
else:
- from torch import __version__ as torver
+ return base_cmd
- return torver
+
+def get_pytorch_update_command(gpu_choice):
+ """Get PyTorch update command based on GPU choice"""
+ base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}"
+
+ if gpu_choice == "NVIDIA":
+ return f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
+ elif gpu_choice == "AMD":
+ return f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4"
+ elif gpu_choice in ["APPLE", "NONE"]:
+ return f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
+ elif gpu_choice == "INTEL":
+ intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
+ return f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
+ else:
+ return base_cmd
+
+
+def get_requirements_file(gpu_choice):
+ """Get requirements file path based on GPU choice"""
+ requirements_base = os.path.join("requirements", "full")
+
+ if gpu_choice == "AMD":
+ file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt"
+ elif gpu_choice == "APPLE":
+ file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt"
+ elif gpu_choice in ["INTEL", "NONE"]:
+ file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt"
+ elif gpu_choice == "NVIDIA":
+ file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt"
+ else:
+ raise ValueError(f"Unknown GPU choice: {gpu_choice}")
+
+ return os.path.join(requirements_base, file_name)
def get_current_commit():
@@ -209,28 +287,8 @@ def get_user_choice(question, options_dict):
def update_pytorch_and_python():
print_big_message("Checking for PyTorch updates.")
-
- # Update the Python version. Left here for future reference in case this becomes necessary.
- # print_big_message("Checking for PyTorch and Python updates.")
- # current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
- # if current_python_version != PYTHON_VERSION:
- # run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True)
-
- torver = torch_version()
- base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}"
-
- if "+cu" in torver:
- install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
- elif "+rocm" in torver:
- install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4"
- elif "+cpu" in torver:
- install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
- elif "+cxx11" in torver:
- intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
- install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
- else:
- install_cmd = base_cmd
-
+ gpu_choice = get_gpu_choice()
+ install_cmd = get_pytorch_update_command(gpu_choice)
run_cmd(install_cmd, assert_success=True, environment=True)
@@ -256,43 +314,11 @@ def install_webui():
if os.path.isfile(state_file):
os.remove(state_file)
- # Ask the user for the GPU vendor
- if "GPU_CHOICE" in os.environ:
- choice = os.environ["GPU_CHOICE"].upper()
- print_big_message(f"Selected GPU choice \"{choice}\" based on the GPU_CHOICE environment variable.")
-
- # Warn about changed meanings and handle old choices
- if choice == "B":
- print_big_message("Warning: GPU_CHOICE='B' now means 'AMD' in the new version.")
- elif choice == "C":
- print_big_message("Warning: GPU_CHOICE='C' now means 'Apple M Series' in the new version.")
- elif choice == "D":
- print_big_message("Warning: GPU_CHOICE='D' now means 'Intel Arc' in the new version.")
- else:
- choice = get_user_choice(
- "What is your GPU?",
- {
- 'A': 'NVIDIA - CUDA 12.4',
- 'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4',
- 'C': 'Apple M Series',
- 'D': 'Intel Arc (beta)',
- 'N': 'CPU mode'
- },
- )
-
- # Convert choices to GPU names for compatibility
- gpu_choice_to_name = {
- "A": "NVIDIA",
- "B": "AMD",
- "C": "APPLE",
- "D": "INTEL",
- "N": "NONE"
- }
-
- selected_gpu = gpu_choice_to_name[choice]
+ # Get GPU choice and save it to state
+ gpu_choice = get_gpu_choice()
# Write a flag to CMD_FLAGS.txt for CPU mode
- if selected_gpu == "NONE":
+ if gpu_choice == "NONE":
cmd_flags_path = os.path.join(script_dir, "user_data", "CMD_FLAGS.txt")
with open(cmd_flags_path, 'r+') as cmd_flags_file:
if "--cpu" not in cmd_flags_file.read():
@@ -300,34 +326,20 @@ def install_webui():
cmd_flags_file.write("\n--cpu\n")
# Handle CUDA version display
- elif any((is_windows(), is_linux())) and selected_gpu == "NVIDIA":
+ elif any((is_windows(), is_linux())) and gpu_choice == "NVIDIA":
print("CUDA: 12.4")
# No PyTorch for AMD on Windows (?)
- elif is_windows() and selected_gpu == "AMD":
+ elif is_windows() and gpu_choice == "AMD":
print("PyTorch setup on Windows is not implemented yet. Exiting...")
sys.exit(1)
- # Find the Pytorch installation command
- install_pytorch = f"python -m pip install torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION} "
-
- if selected_gpu == "NVIDIA":
- install_pytorch += "--index-url https://download.pytorch.org/whl/cu124"
- elif selected_gpu == "AMD":
- install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4"
- elif selected_gpu in ["APPLE", "NONE"]:
- install_pytorch += "--index-url https://download.pytorch.org/whl/cpu"
- elif selected_gpu == "INTEL":
- if is_linux():
- install_pytorch = "python -m pip install torch==2.1.0a0 torchvision==0.16.0a0 torchaudio==2.1.0a0 intel-extension-for-pytorch==2.1.10+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
- else:
- install_pytorch = "python -m pip install torch==2.1.0a0 torchvision==0.16.0a0 torchaudio==2.1.0a0 intel-extension-for-pytorch==2.1.10 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
-
# Install Git and then Pytorch
print_big_message("Installing PyTorch.")
+ install_pytorch = get_pytorch_install_command(gpu_choice)
run_cmd(f"conda install -y ninja git && {install_pytorch} && python -m pip install py-cpuinfo==9.0.0", assert_success=True, environment=True)
- if selected_gpu == "INTEL":
+ if gpu_choice == "INTEL":
# Install oneAPI dependencies via conda
print_big_message("Installing Intel oneAPI runtime libraries.")
run_cmd("conda install -y -c https://software.repos.intel.com/python/conda/ -c conda-forge dpcpp-cpp-rt=2024.0 mkl-dpcpp=2024.0", environment=True)
@@ -349,31 +361,15 @@ def update_requirements(initial_installation=False, pull=True):
assert_success=True
)
- torver = torch_version()
- requirements_base = os.path.join("requirements", "full")
-
- if "+rocm" in torver:
- file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt"
- elif "+cpu" in torver or "+cxx11" in torver:
- file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt"
- elif is_macos():
- file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt"
- else:
- file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt"
-
- requirements_file = os.path.join(requirements_base, file_name)
-
- # Load state from JSON file
current_commit = get_current_commit()
- wheels_changed = False
- if os.path.exists(state_file):
- with open(state_file, 'r') as f:
- last_state = json.load(f)
-
- if 'wheels_changed' in last_state or last_state.get('last_installed_commit') != current_commit:
+ wheels_changed = not os.path.exists(state_file)
+ if not wheels_changed:
+ state = load_state()
+ if 'wheels_changed' in state or state.get('last_installed_commit') != current_commit:
wheels_changed = True
- else:
- wheels_changed = True
+
+ gpu_choice = get_gpu_choice()
+ requirements_file = get_requirements_file(gpu_choice)
if pull:
# Read .whl lines before pulling
@@ -409,19 +405,17 @@ def update_requirements(initial_installation=False, pull=True):
print_big_message(f"File '{file}' was updated during 'git pull'. Please run the script again.")
# Save state before exiting
- current_state = {}
+ state = load_state()
if wheels_changed:
- current_state['wheels_changed'] = True
-
- with open(state_file, 'w') as f:
- json.dump(current_state, f)
-
+ state['wheels_changed'] = True
+ save_state(state)
sys.exit(1)
# Save current state
- current_state = {'last_installed_commit': current_commit}
- with open(state_file, 'w') as f:
- json.dump(current_state, f)
+ state = load_state()
+ state['last_installed_commit'] = current_commit
+ state.pop('wheels_changed', None) # Remove wheels_changed flag
+ save_state(state)
if os.environ.get("INSTALL_EXTENSIONS", "").lower() in ("yes", "y", "true", "1", "t", "on"):
install_extensions_requirements()
@@ -432,11 +426,10 @@ def update_requirements(initial_installation=False, pull=True):
# Update PyTorch
if not initial_installation:
update_pytorch_and_python()
- torver = torch_version()
clean_outdated_pytorch_cuda_dependencies()
print_big_message(f"Installing webui requirements from file: {requirements_file}")
- print(f"TORCH: {torver}\n")
+ print(f"GPU Choice: {gpu_choice}\n")
# Prepare the requirements file
textgen_requirements = open(requirements_file).read().splitlines()
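
The installer hunks above replace the inline GPU-selection and PyTorch-command logic with helpers (`get_gpu_choice()`, `get_pytorch_install_command()`, `get_requirements_file()`, `load_state()`, `save_state()`) whose definitions are not part of this excerpt. The sketch below is only a guess at their general shape, not the project's actual code: the state-file path, prompt text, version pins, and the Intel branch are simplified or omitted. Caching the choice in a JSON state file is what lets `update_requirements()` pick the right requirements file later without asking again.

```python
# Illustrative sketch only: the real helpers live elsewhere in one_click.py and
# may differ. Bodies are assumptions based on how the hunks above call them.
import json
import os

script_dir = os.path.dirname(os.path.abspath(__file__))
state_file = os.path.join(script_dir, "installer_files", "state.json")  # assumed path


def load_state():
    """Return the persisted installer state, or an empty dict if none exists."""
    if os.path.exists(state_file):
        with open(state_file, 'r') as f:
            return json.load(f)
    return {}


def save_state(state):
    """Write the installer state back to disk as JSON."""
    with open(state_file, 'w') as f:
        json.dump(state, f)


def get_gpu_choice():
    """Return a cached GPU choice, or resolve one from GPU_CHOICE / a prompt and cache it."""
    state = load_state()
    if 'gpu_choice' not in state:
        # No input validation here for brevity; the real installer is more careful.
        letter = os.environ.get("GPU_CHOICE", "").upper() or input("What is your GPU? [A/B/C/D/N] ").strip().upper()
        state['gpu_choice'] = {"A": "NVIDIA", "B": "AMD", "C": "APPLE", "D": "INTEL", "N": "NONE"}[letter]
        save_state(state)
    return state['gpu_choice']


def get_pytorch_install_command(gpu_choice):
    """Map the GPU choice to a pip command with the matching PyTorch index URL (Intel omitted)."""
    index_urls = {
        "NVIDIA": "https://download.pytorch.org/whl/cu124",
        "AMD": "https://download.pytorch.org/whl/rocm6.2.4",
        "APPLE": "https://download.pytorch.org/whl/cpu",
        "NONE": "https://download.pytorch.org/whl/cpu",
    }
    return f"python -m pip install torch torchvision torchaudio --index-url {index_urls[gpu_choice]}"
```
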
diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt
index 0eaf10da..277f8249 100644
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@@ -16,6 +16,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -23,7 +24,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
-transformers==4.52.*
+transformers==4.50.*
tqdm
wandb
@@ -33,12 +34,12 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
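
Each wheel URL above carries a PEP 508 environment marker after the semicolon, so `pip` only installs the build that matches the current OS, architecture, and Python version. If it is unclear which line applies on a given machine, the marker can be evaluated directly with the `packaging` library; the marker string below is copied from the CUDA Linux wheel, and the snippet itself is just an illustration.

```python
# Evaluate a PEP 508 environment marker against the current interpreter/platform.
# Requires the "packaging" package, which pip itself depends on.
from packaging.markers import Marker

marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')
print(marker.evaluate())  # True only on 64-bit Linux running Python 3.11
```
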
diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt
index 65f184bf..dbf35c34 100644
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -22,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
-transformers==4.52.*
+transformers==4.50.*
tqdm
wandb
@@ -32,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt
index d20b2ec3..2e5eb6c9 100644
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -22,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
-transformers==4.52.*
+transformers==4.50.*
tqdm
wandb
@@ -32,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt
index 2613d787..9a19ab29 100644
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -22,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
-transformers==4.52.*
+transformers==4.50.*
tqdm
wandb
@@ -32,7 +33,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3-py3-none-any.whl
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl
diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt
index af583b00..973d9bfb 100644
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -22,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
-transformers==4.52.*
+transformers==4.50.*
tqdm
wandb
@@ -32,8 +33,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3-py3-none-any.whl
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl
diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt
index 9bf2a37d..4a48a51f 100644
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -22,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
-transformers==4.52.*
+transformers==4.50.*
tqdm
wandb
@@ -32,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt
index 1731448e..76bde864 100644
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -22,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
-transformers==4.52.*
+transformers==4.50.*
tqdm
wandb
@@ -32,5 +33,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt
index fc481a1a..6cd0fa65 100644
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@@ -16,6 +16,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -23,7 +24,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
-transformers==4.52.*
+transformers==4.50.*
tqdm
wandb
@@ -33,12 +34,12 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt
index 2ed8affa..a412367c 100644
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -22,7 +23,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
-transformers==4.52.*
+transformers==4.50.*
tqdm
wandb
diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt
index fdae681d..60ce941e 100644
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt
index a58f39f7..b1649bc9 100644
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt
index 91ea3a6d..571eba52 100644
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -18,6 +19,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt
index 37e5aa40..88170cf3 100644
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt
index dcb2884b..e96cef49 100644
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt
index 8f1295bb..78f94aa5 100644
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt
index 21805fe2..f6c866cf 100644
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt
index 858b4488..3e41427d 100644
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt
index 569bae99..022ebb61 100644
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
+python-docx==1.1.2
pyyaml
requests
rich
@@ -18,5 +19,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.16.0/llama_cpp_binaries-0.16.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
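
`python-docx==1.1.2` is added to every requirements file above to back the new .docx attachment support. A minimal sketch of pulling text out of a .docx with that library; `report.docx` is a placeholder filename, not a file shipped with the project.

```python
# Minimal example of extracting text with python-docx (the new dependency above).
from docx import Document

doc = Document("report.docx")  # placeholder path
text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
print(text)
```
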
diff --git a/server.py b/server.py
index c22ed1f1..99d2e171 100644
--- a/server.py
+++ b/server.py
@@ -60,6 +60,14 @@ from modules.utils import gradio
def signal_handler(sig, frame):
logger.info("Received Ctrl+C. Shutting down Text generation web UI gracefully.")
+
+ # Explicitly stop LlamaServer to avoid __del__ cleanup issues during shutdown
+ if shared.model and shared.model.__class__.__name__ == 'LlamaServer':
+ try:
+ shared.model.stop()
+ except:
+ pass
+
sys.exit(0)
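
The `server.py` change above stops a running `LlamaServer` explicitly on Ctrl+C, so the llama.cpp subprocess is not left to `__del__` during interpreter shutdown, where cleanup ordering is unreliable. A self-contained sketch of the same pattern with the standard `signal` module; the `LlamaServer` stand-in and handler wiring are illustrative, not the project's exact code.

```python
# Sketch of the Ctrl+C pattern in the hunk above (illustrative stand-ins, not the
# project's exact objects). SIGINT is routed to signal_handler, which shuts the
# backend down explicitly instead of relying on __del__ at interpreter exit.
import signal
import sys
import time


class LlamaServer:  # stand-in for the real llama.cpp server wrapper
    def stop(self):
        print("Stopping llama.cpp server subprocess...")


model = LlamaServer()


def signal_handler(sig, frame):
    try:
        model.stop()  # explicit cleanup while the interpreter is still fully alive
    except Exception:
        pass
    sys.exit(0)


signal.signal(signal.SIGINT, signal_handler)

while True:  # keep the process alive so Ctrl+C can be tested
    time.sleep(1)
```
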
diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml
index ce0f77e1..db481e84 100644
--- a/user_data/settings-template.yaml
+++ b/user_data/settings-template.yaml
@@ -18,7 +18,6 @@ max_new_tokens_min: 1
max_new_tokens_max: 4096
prompt_lookup_num_tokens: 0
max_tokens_second: 0
-max_updates_second: 12
auto_max_new_tokens: true
ban_eos_token: false
add_bos_token: true