Merge pull request #6984 from oobabooga/dev

Merge dev branch
oobabooga 2025-05-16 17:13:26 -03:00 committed by GitHub
commit dc3094549e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
65 changed files with 955 additions and 406 deletions


@ -102,6 +102,8 @@ jobs:
shell: bash
run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
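# delete every top-level directory under extensions/ that is not in the allow-list above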
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables
CUDA_VERSION="${{ matrix.cuda }}"


@ -101,6 +101,8 @@ jobs:
shell: bash
run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"


@ -101,6 +101,8 @@ jobs:
shell: bash
run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"


@ -22,7 +22,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these.
- Multiple sampling parameters and generation options for sophisticated text generation control.
- Switch between different models easily in the UI without restarting, with fine control over settings.
- OpenAI-compatible API with Chat and Completions endpoints; see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support; see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
- 100% offline and private, with zero telemetry, external resources, or remote update requests.
- Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
@ -44,7 +44,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases
To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again.
You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, or `update_wizard_macos.sh`.
<details>
<summary>
@ -55,12 +55,12 @@ Setup details and information about installing manually
The script uses Miniconda to set up a Conda environment in the `installer_files` folder.
If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, `cmd_macos.sh`, or `cmd_wsl.bat`.
If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`.
* There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root.
* To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts.
* For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki).
* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.
* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.
### Manual installation using Conda
@ -90,7 +90,7 @@ conda activate textgen
|--------|---------|---------|
| Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
| Linux/WSL | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu` |
| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.1` |
| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` |
| MacOS + MPS | Any | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` |
| Windows | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
| Windows | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` |


@ -1,7 +1,9 @@
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 28px;
padding-bottom: 14px;
padding-top: 14px;
font-size: 18px;
font-family: Roboto, Arial, sans-serif; /* Modern font */
line-height: 1.5;
@ -102,6 +104,7 @@
@media screen and (width <= 688px) {
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 25px;
font-size: 15px;


@ -2,8 +2,10 @@
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 28px;
padding-bottom: 14px;
padding-top: 14px;
font-size: 18px;
font-family: 'Noto Sans', Arial, sans-serif;
line-height: 1.428571429;
@ -100,6 +102,7 @@
@media screen and (width <= 688px) {
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 25px;
font-size: 15px;


@ -16,6 +16,7 @@
}
.message {
padding-bottom: 2em;
padding-bottom: 1em;
padding-top: 1em;
grid-template-columns: 70px minmax(0, 1fr);
}


@ -1,7 +1,9 @@
.message {
display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 2em;
padding-bottom: 1em;
padding-top: 1em;
font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 22.5px !important;


@ -1,5 +1,6 @@
.message {
padding-bottom: 25px;
padding-bottom: 12.5px;
padding-top: 12.5px;
font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 1.428571429;


@ -1,5 +1,6 @@
.message {
padding-bottom: 25px;
padding-bottom: 12.5px;
padding-top: 12.5px;
font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 1.428571429;


@ -8,10 +8,6 @@
padding-top: 0 !important;
}
.chat > .messages > :last-child {
margin-bottom: 1.7rem !important;
}
.chat .message-body p, .chat .message-body li {
font-size: 1rem !important;
line-height: 28px !important;
@ -46,7 +42,7 @@
}
.chat .user-message {
background: #f5f5f5;
background: #f3f4f6;
padding: 1.5rem 1rem;
padding-bottom: 2rem;
border-radius: 0;
@ -61,16 +57,16 @@
}
.dark .chat .user-message {
background: transparent;
background: var(--light-gray);
}
.dark .chat .assistant-message {
background: var(--light-gray);
background: transparent;
}
.chat .user-message .text,
.chat .assistant-message .text {
max-width: 645px;
max-width: 700px;
margin-left: auto;
margin-right: auto;
}


@ -2,7 +2,7 @@
--darker-gray: #202123;
--dark-gray: #343541;
--light-gray: #444654;
--light-theme-gray: #f5f5f5;
--light-theme-gray: #f9fbff;
--border-color-dark: #525252;
--header-width: 112px;
--selected-item-color-dark: #32333e;
@ -389,7 +389,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.chat {
margin-left: auto;
margin-right: auto;
min-height: var(--chat-height);
flex: 1;
overflow-y: auto;
display: flex;
flex-direction: column;
@ -401,10 +401,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.chat-parent {
height: calc(100dvh - 98px - var(--input-delta));
flex: 1;
overflow: auto !important;
border-radius: 0 !important;
margin-bottom: var(--input-delta) !important;
}
.chat-parent .prose {
@ -421,13 +420,13 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.chat-parent.bigchat {
height: calc(100dvh - 98px - var(--input-delta)) !important;
margin-bottom: var(--input-delta) !important;
flex: 1;
}
.chat > .messages {
display: flex;
flex-direction: column;
min-height: calc(100vh - 102px);
}
.chat > .messages > :first-child {
@ -546,7 +545,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
border-radius: 5px;
font-size: 82%;
padding: 1px 3px;
background: white !important;
background: #f3f4f6 !important;
color: #1f2328;
}
@ -560,18 +559,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
padding: 15px;
}
.message-body :not(pre) > code::before {
content: "`";
}
.message-body :not(pre) > code::after {
content: "`";
}
.message-body :not(pre) > code {
white-space: normal !important;
font-weight: bold;
font-family: unset;
font-size: 0.95em;
font-family: Menlo,"Roboto Mono","Courier New",Courier,monospace,Inter,sans-serif;
padding: .15rem .3rem;
background-color: #ececec;
}
.dark .message-body :not(pre) > code {
background-color: rgb(255 255 255 / 10%);
}
#chat-input {
@ -582,7 +580,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-input textarea {
background: #f3f4f6;
padding: 0.65rem 2.5rem;
border: 0;
box-shadow: 0;
}
#chat-input textarea::placeholder {
@ -603,8 +604,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.chat-input-positioned {
position: absolute;
bottom: 0;
max-width: 54rem;
left: 50%;
transform: translateX(-50%);
@ -744,7 +743,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.hover-menu button {
width: 100%;
background: transparent !important;
background: white !important;
border-radius: 0 !important;
justify-content: space-between;
margin: 0 !important;
@ -760,7 +759,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.hover-menu button:hover {
background: var(--button-secondary-background-fill-hover) !important;
background: #dbeafe !important;
}
.dark .hover-menu button:hover {
background: var(--selected-item-color-dark) !important;
}
.transparent-substring {
@ -789,6 +792,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-input-container {
display: flex;
flex-direction: column;
min-width: 0 !important;
}
@ -798,9 +803,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-input-row {
padding-bottom: 1.5em;
padding-left: 1rem;
padding-right: 1rem;
padding: 1rem;
padding-top: 0;
}
#chat-input-row.bigchat {
@ -808,22 +812,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
#chat-col {
padding-bottom: 100px;
height: 100dvh;
display: flex;
flex-direction: column;
padding-bottom: 0;
gap: 0;
}
@media screen and (width <= 924px) {
#chat-col {
padding-bottom: 100px;
margin-top: 32px;
position: relative; /* Ensure positioning for the pseudo-element */
}
.chat-parent {
height: calc(100dvh - 98px - var(--input-delta) - 32px);
}
.chat-parent.bigchat {
height: calc(100dvh - 98px - var(--input-delta) - 32px) !important;
height: calc(100dvh - 32px);
}
}
@ -985,6 +984,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
cursor: pointer;
}
#past-chats .selected,
#past-chats label:hover {
background-color: #dbeafe !important;
}
#past-chats-buttons,
#delete-chat-row,
#rename-row {
@ -993,7 +997,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
gap: 9px;
}
#past-chats-row,
#chat-controls {
width: 260px;
@ -1111,12 +1114,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
color: #9ca3af;
}
.dark .hover-menu {
background-color: var(--darker-gray);
}
.dark .hover-menu button {
border-color: var(--border-color-primary);
background-color: var(--darker-gray) !important;
}
.dark #chat-controls,
@ -1125,8 +1125,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
border: 0 !important;
}
.dark #past-chats .selected,
.dark #past-chats label:hover {
.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats .selected,
.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats label:hover {
background-color: var(--selected-item-color-dark) !important;
}
@ -1163,7 +1163,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.header_bar button.selected {
background: #E0E0E0;
background: #dbeafe;
}
#chat-controls,
@ -1382,3 +1382,19 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
50% { opacity: 1; }
100% { opacity: 0.6; }
}
strong {
font-weight: bold;
}
.min.svelte-1ybaih5 {
min-height: 0;
}
#vram-info .value {
color: #008d00;
}
.dark #vram-info .value {
color: #07ff07;
}


@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:


@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:


@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:


@ -14,7 +14,7 @@ WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
WORKDIR /home/app/text-generation-webui
# set umask to ensure group read / write at runtime


@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972}
APP_UID: ${APP_UID:-6972}
env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports:
@ -31,17 +31,7 @@ services:
stdin_open: true
tty: true
volumes:
- ./cache:/home/app/text-generation-webui/cache
- ./characters:/home/app/text-generation-webui/characters
- ./extensions:/home/app/text-generation-webui/extensions
- ./loras:/home/app/text-generation-webui/loras
- ./logs:/home/app/text-generation-webui/logs
- ./models:/home/app/text-generation-webui/models
- ./presets:/home/app/text-generation-webui/presets
- ./prompts:/home/app/text-generation-webui/prompts
- ./softprompts:/home/app/text-generation-webui/softprompts
- ./training:/home/app/text-generation-webui/training
- ./cloudflared:/etc/cloudflared
- ./user_data:/home/app/text-generation-webui/user_data
deploy:
resources:
reservations:


@ -257,6 +257,85 @@ headers = {
in any of the examples above.
#### Tool/Function Calling Example
You need to use a model with tool support. The prompt will be automatically formatted using the model's Jinja2 template.
Request:
```
curl http://127.0.0.1:5000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What time is it currently in New York City?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_time",
"description": "Get current time in a specific timezones",
"parameters": {
"type": "object",
"required": ["timezone"],
"properties": {
"timezone": {
"type": "string",
"description": "IANA timezone name (e.g., America/New_York, Europe/London). Use Europe/Berlin as local timezone if no timezone provided by the user."
}
}
}
}
}
]
}'
```
Sample response:
```
{
"id": "chatcmpl-1746532051477984256",
"object": "chat.completion",
"created": 1746532051,
"model": "qwen2.5-coder-14b-instruct-q4_k_m.gguf",
"choices": [
{
"index": 0,
"finish_reason": "tool_calls",
"message": {
"role": "assistant",
"content": "```xml\n<function>\n{\n \"name\": \"get_current_time\",\n \"arguments\": {\n \"timezone\": \"America/New_York\"\n }\n}\n</function>\n```"
},
"tool_calls": [
{
"type": "function",
"function": {
"name": "get_current_time",
"arguments": "{\"timezone\": \"America/New_York\"}"
},
"id": "call_52ij07mh",
"index": "0"
}
]
}
],
"usage": {
"prompt_tokens": 224,
"completion_tokens": 38,
"total_tokens": 262
}
}
```
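
The same call can be made from Python. A minimal sketch using the `requests` library (the endpoint, tool definition, and response shape follow the curl example above):

```
import requests

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What time is it currently in New York City?"}
    ],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_current_time",
            "description": "Get current time in a specific timezone",
            "parameters": {
                "type": "object",
                "required": ["timezone"],
                "properties": {
                    "timezone": {"type": "string", "description": "IANA timezone name"}
                }
            }
        }
    }]
}

result = requests.post("http://127.0.0.1:5000/v1/chat/completions", json=payload).json()

# On a tool call, finish_reason is "tool_calls" and the arguments arrive as a JSON string
for call in result["choices"][0].get("tool_calls", []):
    print(call["function"]["name"], call["function"]["arguments"])
```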
### Environment variables
The following environment variables can be used (they take precedence over everything else):


@ -1,16 +1,14 @@
import base64
import copy
import re
import json
import time
from collections import deque
from io import BytesIO
import requests
import tiktoken
from PIL import Image
from pydantic import ValidationError
from extensions.openai.errors import InvalidRequestError
from extensions.openai.utils import debug_msg
from extensions.openai.typing import ToolDefinition
from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall
from modules import shared
from modules.chat import (
generate_chat_prompt,
@ -96,72 +94,32 @@ def convert_history(history):
user_input_last = True
system_message = ""
# Multimodal: convert OpenAI format to multimodal extension format
if any('content' in entry and isinstance(entry['content'], list) for entry in history):
new_history = []
for entry in history:
if isinstance(entry['content'], list):
for item in entry['content']:
if not isinstance(item, dict):
continue
image_url = None
content = None
if item['type'] == 'image_url' and isinstance(item['image_url'], dict):
image_url = item['image_url']['url']
elif item['type'] == 'text' and isinstance(item['text'], str):
content = item['text']
if image_url:
new_history.append({"image_url": image_url, "role": "user"})
if content:
new_history.append({"content": content, "role": "user"})
else:
new_history.append(entry)
history = new_history
for entry in history:
if "image_url" in entry:
image_url = entry['image_url']
if "base64" in image_url:
image_url = re.sub('^data:image/.+;base64,', '', image_url)
img = Image.open(BytesIO(base64.b64decode(image_url)))
else:
try:
my_res = requests.get(image_url)
img = Image.open(BytesIO(my_res.content))
except Exception:
raise 'Image cannot be loaded from the URL!'
buffered = BytesIO()
if img.mode in ("RGBA", "P"):
img = img.convert("RGB")
img.save(buffered, format="JPEG")
img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
content = f'<img src="data:image/jpeg;base64,{img_str}">'
else:
content = entry["content"]
content = entry["content"]
role = entry["role"]
if role == "user":
user_input = content
user_input_last = True
if current_message:
chat_dialogue.append([current_message, ''])
chat_dialogue.append([current_message, '', ''])
current_message = ""
current_message = content
elif role == "assistant":
if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "":
continue # skip tool calls
current_reply = content
user_input_last = False
if current_message:
chat_dialogue.append([current_message, current_reply])
chat_dialogue.append([current_message, current_reply, ''])
current_message = ""
current_reply = ""
else:
chat_dialogue.append(['', current_reply])
chat_dialogue.append(['', current_reply, ''])
elif role == "tool":
user_input_last = False
chat_dialogue.append(['', '', content])
elif role == "system":
system_message += f"\n{content}" if system_message else content
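
With this change, each `chat_dialogue` row carries three slots, `[user_message, assistant_reply, tool_result]`, instead of two. A sketch with hypothetical values:

```
# Hypothetical history after one tool round-trip:
chat_dialogue = [
    ["What time is it in New York?", "", ""],   # "user" turn, reply pending
    ["", '<function>{"name": "get_current_time"}</function>', ""],  # "assistant" tool call
    ["", "", '{"time": "17:13"}'],              # "tool" result
]
```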
@ -181,6 +139,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
if 'messages' not in body:
raise InvalidRequestError(message="messages is required", param='messages')
tools = None
if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0:
tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails
messages = body['messages']
for m in messages:
if 'role' not in m:
@ -238,6 +200,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
'custom_system_message': custom_system_message,
'chat_template_str': chat_template_str,
'chat-instruct_command': chat_instruct_command,
'tools': tools,
'history': history,
'stream': stream
})
@ -250,7 +213,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
requested_model = generate_params.pop('model')
logprob_proc = generate_params.pop('logprob_proc', None)
def chat_streaming_chunk(content):
def chat_streaming_chunk(content, chunk_tool_calls=None):
# begin streaming
chunk = {
"id": cmpl_id,
@ -260,7 +223,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
resp_list: [{
"index": 0,
"finish_reason": None,
"delta": {'role': 'assistant', 'content': content},
"delta": {'role': 'assistant', 'content': content, 'tool_calls': chunk_tool_calls},
}],
}
@ -269,6 +232,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
# else:
# chunk[resp_list][0]["logprobs"] = None
return chunk
# generate reply #######################################
@ -277,8 +241,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
yield {'prompt': prompt}
return
debug_msg({'prompt': prompt, 'generate_params': generate_params})
if stream:
yield chat_streaming_chunk('')
@ -288,8 +250,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
answer = ''
seen_content = ''
tool_calls = []
end_last_tool_call = 0
supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None
for a in generator:
answer = a['internal'][-1][1]
if supported_tools is not None:
tool_call = parseToolCall(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else []
if len(tool_call) > 0:
for tc in tool_call:
tc["id"] = getToolCallId()
tc["index"] = str(len(tool_calls))
tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"])
tool_calls.append(tc)
end_last_tool_call = len(answer)
if stream:
len_seen = len(seen_content)
new_content = answer[len_seen:]
@ -297,18 +274,25 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
continue
seen_content = answer
chunk = chat_streaming_chunk(new_content)
seen_content = answer
yield chunk
# stop generation if tool_calls were generated previously
if len(tool_calls) > 0:
break
token_count = len(encode(prompt)[0])
completion_token_count = len(encode(answer)[0])
stop_reason = "stop"
if len(tool_calls) > 0:
stop_reason = "tool_calls"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']:
stop_reason = "length"
if stream:
chunk = chat_streaming_chunk('')
chunk = chat_streaming_chunk('', tool_calls)
chunk[resp_list][0]['finish_reason'] = stop_reason
chunk['usage'] = {
"prompt_tokens": token_count,
@ -326,7 +310,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
resp_list: [{
"index": 0,
"finish_reason": stop_reason,
"message": {"role": "assistant", "content": answer}
"message": {"role": "assistant", "content": answer},
"tool_calls": tool_calls
}],
"usage": {
"prompt_tokens": token_count,
@ -515,3 +500,19 @@ def completions(body: dict, is_legacy: bool = False) -> dict:
def stream_completions(body: dict, is_legacy: bool = False):
for resp in completions_common(body, is_legacy, stream=True):
yield resp
def validateTools(tools: list[dict]):
# Validate each tool definition in the JSON array
valid_tools = None
for idx in range(len(tools)):
tool = tools[idx]
try:
tool_definition = ToolDefinition(**tool)
if valid_tools is None:
valid_tools = []
valid_tools.append(tool)
except ValidationError:
raise InvalidRequestError(message=f"Invalid tool specification at index {idx}.", param='tools')
return valid_tools
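
A sketch of the validation behavior, reusing the tool definition from the API documentation example (hypothetical usage):

```
tools = validateTools([{
    "type": "function",
    "function": {
        "name": "get_current_time",
        "description": "Get current time in a specific timezone",
        "parameters": {"type": "object", "properties": {}}
    }
}])  # returns the list unchanged

# An entry that fails the ToolDefinition model, e.g. one missing "function",
# raises InvalidRequestError naming the offending index.
```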


@ -14,6 +14,7 @@ from fastapi.requests import Request
from fastapi.responses import JSONResponse
from pydub import AudioSegment
from sse_starlette import EventSourceResponse
from starlette.concurrency import iterate_in_threadpool
import extensions.openai.completions as OAIcompletions
import extensions.openai.images as OAIimages
@ -115,7 +116,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
async def generator():
async with streaming_semaphore:
response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy)
for resp in response:
async for resp in iterate_in_threadpool(response):
disconnected = await request.is_disconnected()
if disconnected:
break
@ -125,7 +126,12 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
return EventSourceResponse(generator()) # SSE streaming
else:
response = OAIcompletions.completions(to_dict(request_data), is_legacy=is_legacy)
response = await asyncio.to_thread(
OAIcompletions.completions,
to_dict(request_data),
is_legacy=is_legacy
)
return JSONResponse(response)
@ -138,7 +144,7 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
async def generator():
async with streaming_semaphore:
response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy)
for resp in response:
async for resp in iterate_in_threadpool(response):
disconnected = await request.is_disconnected()
if disconnected:
break
@ -148,7 +154,12 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
return EventSourceResponse(generator()) # SSE streaming
else:
response = OAIcompletions.chat_completions(to_dict(request_data), is_legacy=is_legacy)
response = await asyncio.to_thread(
OAIcompletions.chat_completions,
to_dict(request_data),
is_legacy=is_legacy
)
return JSONResponse(response)
@ -436,7 +447,7 @@ def run_server():
# Start server
logging.getLogger("uvicorn.error").propagate = False
uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)
def setup():


@ -1,8 +1,8 @@
import json
import time
from typing import Dict, List
from typing import Dict, List, Optional
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, validator
class GenerationOptions(BaseModel):
@ -54,6 +54,48 @@ class GenerationOptions(BaseModel):
grammar_string: str = ""
class ToolDefinition(BaseModel):
function: 'ToolFunction'
type: str
class ToolFunction(BaseModel):
description: str
name: str
parameters: 'ToolParameters'
class ToolParameters(BaseModel):
properties: Optional[Dict[str, 'ToolProperty']] = None
required: Optional[list[str]] = None
type: str
description: Optional[str] = None
class ToolProperty(BaseModel):
description: Optional[str] = None
type: Optional[str] = None # optional because some schemas use anyOf instead of a plain type, e.g. {'base_branch': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'Base Branch'}}
class FunctionCall(BaseModel):
name: str
arguments: Optional[str] = None
parameters: Optional[str] = None
@validator('arguments', allow_reuse=True)
def checkPropertyArgsOrParams(cls, v, values, **kwargs):
if not v and not values.get('parameters'):
raise ValueError("At least one of 'arguments' or 'parameters' must be provided as property in FunctionCall type")
return v
class ToolCall(BaseModel):
id: str
index: int
type: str
function: FunctionCall
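
A sketch of a fully populated `ToolCall`, mirroring the shape in the API response above (values taken from the documentation example):

```
ToolCall(
    id="call_52ij07mh",
    index=0,
    type="function",
    function=FunctionCall(
        name="get_current_time",
        arguments='{"timezone": "America/New_York"}'
    )
)
```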
class CompletionRequestParams(BaseModel):
model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.")
prompt: str | List[str]
@ -92,6 +134,7 @@ class ChatCompletionRequestParams(BaseModel):
frequency_penalty: float | None = 0
function_call: str | dict | None = Field(default=None, description="Unused parameter.")
functions: List[dict] | None = Field(default=None, description="Unused parameter.")
tools: List[dict] | None = Field(default=None, description="Tool signatures passed via MCP.")
logit_bias: dict | None = None
max_tokens: int | None = None
n: int | None = Field(default=1, description="Unused parameter.")


@ -1,5 +1,8 @@
import base64
import json
import os
import random
import re
import time
import traceback
from typing import Callable, Optional
@ -52,3 +55,94 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star
time.sleep(3)
raise Exception('Could not start cloudflared.')
def getToolCallId() -> str:
letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789"
b = [random.choice(letter_bytes) for _ in range(8)]
return "call_" + "".join(b).lower()
def checkAndSanitizeToolCallCandidate(candidate_dict: dict, tool_names: list[str]):
# check if property 'function' exists and is a dictionary, otherwise adapt dict
if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str):
candidate_dict = {"type": "function", "function": candidate_dict}
if 'function' in candidate_dict and isinstance(candidate_dict['function'], str):
candidate_dict['name'] = candidate_dict['function']
del candidate_dict['function']
candidate_dict = {"type": "function", "function": candidate_dict}
if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict):
# check if 'name' exists within 'function' and is part of known tools
if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names:
candidate_dict["type"] = "function" # ensure required property 'type' exists and has the right value
# map property 'parameters' used by some older models to 'arguments'
if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]:
candidate_dict["function"]["arguments"] = candidate_dict["function"]["parameters"]
del candidate_dict["function"]["parameters"]
return candidate_dict
return None
def parseToolCall(answer: str, tool_names: list[str]):
matches = []
# abort on very short answers to save computation cycles
if len(answer) < 10:
return matches
# Define the regex pattern to find the JSON content wrapped in <function>, <tools>, <tool_call>, and other tags observed from various models
patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)</\1>"]
for pattern in patterns:
for match in re.finditer(pattern, answer, re.DOTALL):
# print(match.group(2))
if match.group(2) is None:
continue
# remove backtick wraps if present
candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip())
candidate = re.sub(r"```$", "", candidate.strip())
# unwrap inner tags
candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL)
# llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually
if re.search(r"\}\s*\n\s*\{", candidate) is not None:
candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
if not candidate.strip().startswith("["):
candidate = "[" + candidate + "]"
candidates = []
try:
# parse the candidate JSON into a dictionary
candidates = json.loads(candidate)
if not isinstance(candidates, list):
candidates = [candidates]
except json.JSONDecodeError:
# Ignore invalid JSON silently
continue
for candidate_dict in candidates:
checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
if checked_candidate is not None:
matches.append(checked_candidate)
# last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags
if len(matches) == 0:
try:
candidate = answer
# llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually
if re.search(r"\}\s*\n\s*\{", candidate) is not None:
candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
if not candidate.strip().startswith("["):
candidate = "[" + candidate + "]"
# parse the candidate JSON into a dictionary
candidates = json.loads(candidate)
if not isinstance(candidates, list):
candidates = [candidates]
for candidate_dict in candidates:
checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
if checked_candidate is not None:
matches.append(checked_candidate)
except json.JSONDecodeError:
# Ignore invalid JSON silently
pass
return matches
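
A sketch of the parser on a hypothetical model answer (the tool name matches the documentation example):

```
answer = '<tool_call>\n{"name": "get_current_time", "arguments": {"timezone": "America/New_York"}}\n</tool_call>'
calls = parseToolCall(answer, ["get_current_time"])
# -> [{'type': 'function', 'function': {'name': 'get_current_time',
#      'arguments': {'timezone': 'America/New_York'}}}]
# 'arguments' is still a dict here; the caller in completions.py json.dumps() it.
```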


@ -1,10 +1,11 @@
import math
import random
import threading
import torch
import chromadb
import numpy as np
import posthog
import torch
from chromadb.config import Settings
from chromadb.utils import embedding_functions
@ -292,6 +293,8 @@ class ChromaCollector():
for doc in documents:
doc_tokens = encode(doc)[0]
if isinstance(doc_tokens, np.ndarray):
doc_tokens = doc_tokens.tolist()
doc_token_count = len(doc_tokens)
if current_token_count + doc_token_count > max_token_count:
# If adding this document would exceed the max token count,


@ -150,6 +150,16 @@ const observer = new MutationObserver(function(mutations) {
if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) {
targetElement.scrollTop = targetElement.scrollHeight;
}
const chatElement = document.getElementById("chat");
if (chatElement) {
const messagesContainer = chatElement.querySelector(".messages");
const lastChild = messagesContainer?.lastElementChild;
const prevSibling = lastChild?.previousElementSibling;
if (lastChild && prevSibling) {
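// Give the last message enough min-height that a fresh reply can scroll to the top of the view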
lastChild.style.minHeight = `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px))`;
}
}
});
// Configure the observer to watch for changes in the subtree and attributes
@ -442,12 +452,6 @@ function updateCssProperties() {
// Check if the chat container is visible
if (chatContainer.clientHeight > 0) {
const chatContainerParentHeight = chatContainer.parentNode.clientHeight;
const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`;
document.documentElement.style.setProperty("--chat-height", newChatHeight);
document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`);
// Adjust scrollTop based on input height change
if (chatInputHeight !== currentChatInputHeight) {
const deltaHeight = chatInputHeight - currentChatInputHeight;


@ -5,6 +5,7 @@ import html
import json
import pprint
import re
import time
from datetime import datetime
from functools import partial
from pathlib import Path
@ -145,7 +146,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
instruct_renderer = partial(
instruction_template.render,
builtin_tools=None,
tools=None,
tools=state['tools'] if 'tools' in state else None,
tools_in_user_message=False,
add_generation_prompt=False
)
@ -171,9 +172,13 @@ def generate_chat_prompt(user_input, state, **kwargs):
messages.append({"role": "system", "content": context})
insert_pos = len(messages)
for user_msg, assistant_msg in reversed(history):
user_msg = user_msg.strip()
assistant_msg = assistant_msg.strip()
for entry in reversed(history):
user_msg = entry[0].strip()
assistant_msg = entry[1].strip()
tool_msg = entry[2].strip() if len(entry) > 2 else ''
if tool_msg:
messages.insert(insert_pos, {"role": "tool", "content": tool_msg})
if assistant_msg:
messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
@ -394,16 +399,13 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
# Extract the reply
if state['mode'] in ['chat', 'chat-instruct']:
visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply + '')
visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply)
else:
visible_reply = reply + ''
visible_reply = reply
visible_reply = html.escape(visible_reply)
if shared.stop_everything:
if output['visible'][-1][1].endswith(''):
output['visible'][-1][1] = output['visible'][-1][1][:-1]
output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
yield output
return
@ -419,9 +421,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
if is_stream:
yield output
if output['visible'][-1][1].endswith(''):
output['visible'][-1][1] = output['visible'][-1][1][:-1]
output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
yield output
@ -481,9 +480,17 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
send_dummy_reply(state['start_with'], state)
history = state['history']
last_save_time = time.monotonic()
save_interval = 8
for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)):
yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
current_time = time.monotonic()
# Save on first iteration or if save_interval seconds have passed
if i == 0 or (current_time - last_save_time) >= save_interval:
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
last_save_time = current_time
save_history(history, state['unique_id'], state['character_menu'], state['mode'])


@ -119,7 +119,7 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
reset = True
# Maximum number of tokens to process in a single forward pass
max_chunk_size = 2048
max_chunk_size = 256
# Make the forward call
if labels is None:


@ -66,7 +66,7 @@ class LlamaServer:
"top_k": state["top_k"],
"top_p": state["top_p"],
"min_p": state["min_p"],
"tfs_z": state["tfs"],
"top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1,
"typical_p": state["typical_p"],
"repeat_penalty": state["repetition_penalty"],
"repeat_last_n": state["repetition_penalty_range"],
@ -102,8 +102,10 @@ class LlamaServer:
penalty_found = False
for s in samplers:
if s.strip() in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]:
if s.strip() in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc"]:
filtered_samplers.append(s.strip())
elif s.strip() == "typical_p":
filtered_samplers.append("typ_p")
elif not penalty_found and s.strip() == "repetition_penalty":
filtered_samplers.append("penalties")
penalty_found = True
@ -144,8 +146,9 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print()
# Make a direct request with streaming enabled using a context manager
with self.session.post(url, json=payload, stream=True) as response:
# Make the generation request
response = self.session.post(url, json=payload, stream=True)
try:
response.raise_for_status() # Raise an exception for HTTP errors
full_text = ""
@ -182,6 +185,8 @@ class LlamaServer:
print(f"JSON decode error: {e}")
print(f"Problematic line: {line}")
continue
finally:
response.close()
def generate(self, prompt, state):
output = ""
@ -210,14 +215,15 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print()
response = self.session.post(url, json=payload)
result = response.json()
for retry in range(5):
response = self.session.post(url, json=payload)
result = response.json()
if "completion_probabilities" in result:
if use_samplers:
return result["completion_probabilities"][0]["top_probs"]
else:
return result["completion_probabilities"][0]["top_logprobs"]
if "completion_probabilities" in result:
if use_samplers:
return result["completion_probabilities"][0]["top_probs"]
else:
return result["completion_probabilities"][0]["top_logprobs"]
else:
raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
@ -255,9 +261,10 @@ class LlamaServer:
self.server_path,
"--model", self.model_path,
"--ctx-size", str(shared.args.ctx_size),
"--n-gpu-layers", str(shared.args.n_gpu_layers),
"--gpu-layers", str(shared.args.gpu_layers),
"--batch-size", str(shared.args.batch_size),
"--port", str(self.port),
"--no-webui",
]
if shared.args.flash_attn:
@ -278,8 +285,10 @@ class LlamaServer:
cmd.append("--no-kv-offload")
if shared.args.row_split:
cmd += ["--split-mode", "row"]
cache_type = "fp16"
if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types:
cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
cache_type = shared.args.cache_type
if shared.args.compress_pos_emb != 1:
cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
if shared.args.rope_freq_base > 0:
@ -316,9 +325,15 @@ class LlamaServer:
for flag_item in extra_flags.split(','):
if '=' in flag_item:
flag, value = flag_item.split('=', 1)
cmd += [f"--{flag}", value]
if len(flag) <= 3:
cmd += [f"-{flag}", value]
else:
cmd += [f"--{flag}", value]
else:
cmd.append(f"--{flag_item}")
if len(flag_item) <= 3:
cmd.append(f"-{flag_item}")
else:
cmd.append(f"--{flag_item}")
env = os.environ.copy()
if os.name == 'posix':
@ -333,6 +348,7 @@ class LlamaServer:
print(' '.join(str(item) for item in cmd[1:]))
print()
logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}")
# Start the server with pipes for output
self.process = subprocess.Popen(
cmd,


@ -5,7 +5,7 @@ import gradio as gr
loaders_and_params = OrderedDict({
'llama.cpp': [
'n_gpu_layers',
'gpu_layers',
'threads',
'threads_batch',
'batch_size',
@ -28,6 +28,7 @@ loaders_and_params = OrderedDict({
'device_draft',
'ctx_size_draft',
'speculative_decoding_accordion',
'vram_info',
],
'Transformers': [
'gpu_split',
@ -84,7 +85,6 @@ loaders_and_params = OrderedDict({
'no_flash_attn',
'no_xformers',
'no_sdpa',
'exllamav2_info',
'model_draft',
'draft_max',
'ctx_size_draft',
@ -299,7 +299,7 @@ loaders_samplers = {
'typical_p',
'xtc_threshold',
'xtc_probability',
'tfs',
'top_n_sigma',
'dry_multiplier',
'dry_allowed_length',
'dry_base',


@ -7,6 +7,7 @@ from modules import models, shared
from modules.logging_colors import logger
from modules.models import load_model
from modules.text_generation import generate_reply
from modules.utils import check_model_loaded
global_scores = None
@ -33,9 +34,9 @@ def get_next_logits(*args, **kwargs):
def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
if shared.model is None:
logger.error("No model is loaded! Select one in the Model tab.")
return 'Error: No model is loaded1 Select one in the Model tab.', previous
model_is_loaded, error_message = check_model_loaded()
if not model_is_loaded:
return error_message, previous
# llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer':


@ -71,7 +71,6 @@ def llama_cpp_server_loader(model_name):
else:
model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
logger.info(f"llama.cpp weights detected: \"{model_file}\"")
try:
model = LlamaServer(model_file)
return model, model


@ -1,7 +1,11 @@
import functools
import json
import re
import subprocess
from math import exp
from pathlib import Path
import gradio as gr
import yaml
from modules import chat, loaders, metadata_gguf, shared, ui
@ -54,7 +58,7 @@ def get_model_metadata(model):
else:
model_file = list(path.glob('*.gguf'))[0]
metadata = metadata_gguf.load_metadata(model_file)
metadata = load_gguf_metadata_with_cache(model_file)
for k in metadata:
if k.endswith('context_length'):
@ -67,7 +71,7 @@ def get_model_metadata(model):
elif k.endswith('rope.scaling.factor'):
model_settings['compress_pos_emb'] = metadata[k]
elif k.endswith('block_count'):
model_settings['n_gpu_layers'] = metadata[k] + 1
model_settings['gpu_layers'] = metadata[k] + 1
if 'tokenizer.chat_template' in metadata:
template = metadata['tokenizer.chat_template']
@ -209,15 +213,27 @@ def apply_model_settings_to_state(model, state):
model_settings = get_model_metadata(model)
if 'loader' in model_settings:
loader = model_settings.pop('loader')
# If the user is using an alternative loader for the same model type, let them keep using it
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
state['loader'] = loader
for k in model_settings:
if k in state:
if k in state and k != 'gpu_layers': # Skip gpu_layers, handle separately
state[k] = model_settings[k]
# Handle GPU layers and VRAM update for llama.cpp
if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
vram_info, gpu_layers_update = update_gpu_layers_and_vram(
state['loader'],
model,
model_settings['gpu_layers'],
state['ctx_size'],
state['cache_type'],
auto_adjust=True
)
state['gpu_layers'] = gpu_layers_update
state['vram_info'] = vram_info
return state
@ -277,3 +293,186 @@ def save_instruction_template(model, template):
yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
else:
yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")
@functools.lru_cache(maxsize=1)
def load_gguf_metadata_with_cache(model_file):
return metadata_gguf.load_metadata(model_file)
def get_model_size_mb(model_file: Path) -> float:
filename = model_file.name
# Check for multipart pattern
match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename)
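# e.g. "Model-00001-of-00002.gguf" (hypothetical name) -> base pattern "Model"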
if match:
# It's a multipart file, find all matching parts
base_pattern = match.group(1)
part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf'))
total_size = sum(p.stat().st_size for p in part_files)
else:
# Single part
total_size = model_file.stat().st_size
return total_size / (1024 ** 2) # Return size in MB
def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
metadata = load_gguf_metadata_with_cache(model_file)
size_in_mb = get_model_size_mb(model_file)
# Extract values from metadata
n_layers = None
n_kv_heads = None
embedding_dim = None
context_length = None
feed_forward_dim = None
for key, value in metadata.items():
if key.endswith('.block_count'):
n_layers = value
elif key.endswith('.attention.head_count_kv'):
n_kv_heads = value
elif key.endswith('.embedding_length'):
embedding_dim = value
elif key.endswith('.context_length'):
context_length = value
elif key.endswith('.feed_forward_length'):
feed_forward_dim = value
if gpu_layers > n_layers:
gpu_layers = n_layers
# Convert cache_type to numeric
if cache_type == 'q4_0':
cache_type = 4
elif cache_type == 'q8_0':
cache_type = 8
else:
cache_type = 16
# Derived features
size_per_layer = size_in_mb / max(n_layers, 1e-6)
context_per_layer = context_length / max(n_layers, 1e-6)
ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6)
kv_cache_factor = n_kv_heads * cache_type * ctx_size
# Helper function for smaller
def smaller(x, y):
return 1 if x < y else 0
# Calculate VRAM using the model
# Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
vram = (
(size_per_layer - 21.19195204848197)
* exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845))
+ 0.0006621544775632052 * context_per_layer
+ 3.34664386576376e-05 * kv_cache_factor
) * (1.363306170123392 + gpu_layers) + 1255.163594536052
return vram
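
A sketch of calling the estimator (assumes a GGUF file under `shared.args.model_dir`; the constants come from the fitted formula above):

```
# Hypothetical model file:
vram_mib = estimate_vram('some-model.gguf', gpu_layers=48, ctx_size=8192, cache_type='fp16')
print(f"Estimated VRAM: {vram_mib:.0f} MiB")

# Note: with fp16 cache the numeric cache_type is 16, and the exp() term
# collapses to 1 whenever feed_forward_dim / embedding_dim >= 2.671.
```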
def get_nvidia_free_vram():
"""
Calculates the total free VRAM across all NVIDIA GPUs by parsing nvidia-smi output.
Returns:
int: The total free VRAM in MiB summed across all detected NVIDIA GPUs.
Returns -1 if nvidia-smi command fails (not found, error, etc.).
Returns 0 if nvidia-smi succeeds but no GPU memory info found.
"""
try:
# Execute nvidia-smi command
result = subprocess.run(
['nvidia-smi'],
capture_output=True,
text=True,
check=False
)
# Check if nvidia-smi returned an error
if result.returncode != 0:
return -1
# Parse the output for memory usage patterns
output = result.stdout
# Find memory usage like "XXXXMiB / YYYYMiB"
# Captures used and total memory for each GPU
matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output)
if not matches:
# No GPUs found in expected format
return 0
total_free_vram_mib = 0
for used_mem_str, total_mem_str in matches:
try:
used_mib = int(used_mem_str)
total_mib = int(total_mem_str)
total_free_vram_mib += (total_mib - used_mib)
except ValueError:
# Skip malformed entries
pass
return total_free_vram_mib
except FileNotFoundError:
# nvidia-smi not found (likely no NVIDIA drivers installed)
return -1
except Exception:
# Handle any other unexpected exceptions
return -1
def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
"""
Unified function to handle GPU layers and VRAM updates.
Args:
for_ui: If True, returns Gradio updates. If False, returns raw values.
Returns:
- If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
- If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
"""
if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"):
vram_info = "<div id=\"vram-info\"'>Estimated VRAM to load the model:</span>"
if for_ui:
return (vram_info, gr.update()) if auto_adjust else vram_info
else:
return (0, gpu_layers) if auto_adjust else 0
current_layers = gpu_layers
max_layers = gpu_layers
if auto_adjust:
# Get max layers from model metadata
model_settings = get_model_metadata(model)
max_layers = model_settings.get('gpu_layers', gpu_layers)
# Auto-adjust based on available VRAM
available_vram = get_nvidia_free_vram()
if available_vram > 0:
tolerance = 906
current_layers = max_layers
while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
current_layers -= 1
# Calculate VRAM with current layers
vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)
if for_ui:
vram_info = f"<div id=\"vram-info\"'>Estimated VRAM to load the model: <span class=\"value\">{vram_usage:.0f} MiB</span>"
if auto_adjust:
return vram_info, gr.update(value=current_layers, maximum=max_layers)
else:
return vram_info
else:
if auto_adjust:
return vram_usage, current_layers
else:
return vram_usage
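
A sketch of the non-UI path (hypothetical model name):

```
vram_mib, layers = update_gpu_layers_and_vram(
    'llama.cpp', 'some-model.gguf',
    gpu_layers=48, ctx_size=8192, cache_type='fp16',
    auto_adjust=True, for_ui=False
)
# 'layers' is walked down from the metadata maximum until the estimate
# fits in free NVIDIA VRAM minus the ~906 MiB tolerance.
```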


@ -11,7 +11,7 @@ from modules.logging_colors import logger
def default_preset():
return {
result = {
'temperature': 1,
'dynatemp_low': 1,
'dynatemp_high': 1,
@ -46,10 +46,17 @@ def default_preset():
'do_sample': True,
'dynamic_temperature': False,
'temperature_last': False,
'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntop_n_sigma\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
'dry_sequence_breakers': '"\\n", ":", "\\"", "*"',
}
if shared.args.portable:
samplers = result['sampler_priority'].split('\n')
samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]]
result['sampler_priority'] = '\n'.join(samplers)
return result
def presets_params():
return [k for k in default_preset()]


@ -47,7 +47,6 @@ settings = {
'max_new_tokens_max': 4096,
'prompt_lookup_num_tokens': 0,
'max_tokens_second': 0,
'max_updates_second': 12,
'auto_max_new_tokens': True,
'ban_eos_token': False,
'add_bos_token': True,
@ -60,7 +59,6 @@ settings = {
'custom_stopping_strings': '',
'custom_token_bans': '',
'negative_prompt': '',
'autoload_model': False,
'dark_theme': True,
'default_extensions': [],
'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}",
@ -121,7 +119,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
@ -130,9 +128,9 @@ group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
# Cache
group = parser.add_argument_group('Context and cache management')
group = parser.add_argument_group('Context and cache')
group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.')
group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
# Speculative decoding
group = parser.add_argument_group('Speculative decoding')
@ -161,10 +159,6 @@ group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='B
group = parser.add_argument_group('TensorRT-LLM')
group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
# Cache
group = parser.add_argument_group('Cache')
group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
# DeepSpeed
group = parser.add_argument_group('DeepSpeed')
group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
@ -190,6 +184,7 @@ group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certific
group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None)
group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy')
group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.')
group.add_argument('--portable', action='store_true', help='Hide features not available in portable mode like training.')
# API
group = parser.add_argument_group('API')
@ -311,11 +306,13 @@ if args.api or args.public_api:
add_extension('openai', last=True)
# Load model-specific settings
with Path(f'{args.model_dir}/config.yaml') as p:
if p.exists():
model_config = yaml.safe_load(open(p, 'r').read())
else:
model_config = {}
p = Path(f'{args.model_dir}/config.yaml')
if p.exists():
model_config = yaml.safe_load(open(p, 'r').read())
else:
model_config = {}
del p
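The rewrite above replaces a deprecated use of Path as a context manager (removed in Python 3.13), though it still opens the file without closing it explicitly. An equivalent sketch that closes the handle deterministically:

    from pathlib import Path

    import yaml

    def load_model_config(model_dir: str) -> dict:
        p = Path(model_dir) / 'config.yaml'
        if p.exists():
            with open(p, 'r') as f:
                # safe_load returns None for an empty file; normalize to {}
                return yaml.safe_load(f.read()) or {}
        return {}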
# Load custom model-specific settings
user_config = load_user_config()

View file

@ -1,15 +1,15 @@
from pathlib import Path
import torch
import tensorrt_llm
import torch
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
from modules import shared
from modules.logging_colors import logger
from modules.text_generation import (
get_max_prompt_length,
get_reply_from_output_ids
)
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
class TensorRTLLMModel:

View file

@ -14,6 +14,7 @@ from modules.callbacks import Iteratorize
from modules.extensions import apply_extensions
from modules.html_generator import generate_basic_html
from modules.logging_colors import logger
from modules.utils import check_model_loaded
def generate_reply(*args, **kwargs):
@ -34,8 +35,8 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
# Find the appropriate generation function
generate_func = apply_extensions('custom_generate_reply')
if generate_func is None:
if shared.model_name == 'None' or shared.model is None:
logger.error("No model is loaded! Select one in the Model tab.")
model_is_loaded, error_message = check_model_loaded()
if not model_is_loaded:
yield ''
return
@ -64,41 +65,39 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
all_stop_strings += st
shared.stop_everything = False
last_update = -1
reply = ''
is_stream = state['stream']
if len(all_stop_strings) > 0 and not state['stream']:
state = copy.deepcopy(state)
state['stream'] = True
min_update_interval = 0
if state.get('max_updates_second', 0) > 0:
min_update_interval = 1 / state['max_updates_second']
# Generate
last_update = -1
latency_threshold = 1 / 1000
for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat):
cur_time = time.monotonic()
reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
if escape_html:
reply = html.escape(reply)
if is_stream:
cur_time = time.time()
# Limit number of tokens/second to make text readable in real time
if state['max_tokens_second'] > 0:
diff = 1 / state['max_tokens_second'] - (cur_time - last_update)
if diff > 0:
time.sleep(diff)
last_update = time.time()
last_update = time.monotonic()
yield reply
# Limit updates to avoid lag in the Gradio UI
# API updates are not limited
else:
if cur_time - last_update > min_update_interval:
last_update = cur_time
# If 'generate_func' takes less than 0.001 seconds to yield the next token
# (equivalent to more than 1000 tok/s), assume that the UI is lagging behind and skip yielding
if (cur_time - last_update) > latency_threshold:
yield reply
last_update = time.monotonic()
if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything):
break
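Two behavioral changes are bundled in this hunk: time.time() becomes time.monotonic() so the rate math is immune to wall-clock adjustments, and the old max_updates_second UI setting is replaced by a fixed latency heuristic - if the generator yields faster than 1000 tokens/s, the UI is assumed to be lagging and intermediate yields are skipped. A standalone sketch of both throttles (a simplification, not the committed function):

    import time

    def throttled(token_stream, max_tokens_second=0, latency_threshold=1 / 1000):
        last_update = -1
        for reply in token_stream:
            cur_time = time.monotonic()
            if max_tokens_second > 0:
                # Cap tokens/second so streamed text stays readable
                diff = 1 / max_tokens_second - (cur_time - last_update)
                if diff > 0:
                    time.sleep(diff)
                last_update = time.monotonic()
                yield reply
            elif (cur_time - last_update) > latency_threshold:
                # Skip yields that arrive faster than the UI can render them
                yield reply
                last_update = time.monotonic()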
@ -471,7 +470,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
t1 = time.time()
original_tokens = len(original_input_ids[0])
new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
return
@ -480,7 +479,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
For models that do not use the transformers library for sampling
"""
seed = set_manual_seed(state['seed'])
state['seed'] = set_manual_seed(state['seed'])
t0 = time.time()
reply = ''
try:
@ -500,7 +499,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
t1 = time.time()
original_tokens = len(encode(original_question)[0])
new_tokens = len(encode(original_question + reply)[0]) - original_tokens
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})')
return

View file

@ -61,7 +61,7 @@ if not shared.args.old_colors:
background_fill_primary_dark='var(--darker-gray)',
body_background_fill="white",
block_background_fill="transparent",
body_text_color="#333",
body_text_color='rgb(64, 64, 64)',
button_secondary_background_fill="#f4f4f4",
button_secondary_border_color="var(--border-color-primary)",
@ -105,7 +105,7 @@ def list_model_elements():
'filter_by_loader',
'loader',
'cpu_memory',
'n_gpu_layers',
'gpu_layers',
'threads',
'threads_batch',
'batch_size',
@ -192,7 +192,6 @@ def list_interface_input_elements():
'max_new_tokens',
'prompt_lookup_num_tokens',
'max_tokens_second',
'max_updates_second',
'do_sample',
'dynamic_temperature',
'temperature_last',

View file

@ -46,8 +46,8 @@ def create_ui():
with gr.Row():
with gr.Column(elem_id='chat-col'):
shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer
shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
with gr.Row(elem_id="chat-input-row"):
with gr.Column(scale=1, elem_id='gr-hover-container'):
gr.HTML(value='<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">&#9776;</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover')

View file

@ -14,6 +14,7 @@ from modules.models_settings import (
get_model_metadata,
save_instruction_template,
save_model_settings,
update_gpu_layers_and_vram,
update_model_parameters
)
from modules.utils import gradio
@ -26,71 +27,36 @@ def create_ui():
with gr.Row():
with gr.Column():
with gr.Row():
with gr.Column():
with gr.Row():
shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu)
shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)
shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu)
shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)
with gr.Column():
with gr.Row():
shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
with gr.Row():
with gr.Column():
shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None)
with gr.Blocks():
gr.Markdown("## Main options")
with gr.Row():
with gr.Column():
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation=eager while loading the model.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).')
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
# Speculative decoding
@ -99,15 +65,50 @@ def create_ui():
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.')
shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.')
shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
with gr.Column():
with gr.Row():
shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu)
gr.Markdown("## Other options")
with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'):
with gr.Row():
with gr.Column():
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
with gr.Column():
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation=eager while loading the model.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
if not shared.args.portable:
with gr.Row():
shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
with gr.Column():
with gr.Tab("Download"):
shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu)
shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu)
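The download box accepts a bare username/model path with an optional ":branch" suffix. A hypothetical helper showing how such a spec splits (the project's downloader has its own parsing):

    def split_model_spec(spec: str, default_branch: str = 'main'):
        # Only the last ':' separates the branch from the repo path
        if ':' in spec:
            repo_id, branch = spec.rsplit(':', 1)
        else:
            repo_id, branch = spec, default_branch
        return repo_id, branch

    print(split_model_spec('facebook/galactica-125m:main'))  # ('facebook/galactica-125m', 'main')
    print(split_model_spec('facebook/galactica-125m'))       # ('facebook/galactica-125m', 'main')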
@ -132,11 +133,10 @@ def create_event_handlers():
# In this event handler, the interface state is read and updated
# with the model defaults (if any), and then the model is loaded
# unless "autoload_model" is unchecked
shared.gradio['model_menu'].change(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then(
load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success(
partial(load_model_wrapper, autoload=False), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success(
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)
shared.gradio['load_model'].click(
@ -145,15 +145,31 @@ def create_event_handlers():
partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success(
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)
shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False)
shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then(
partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False)
shared.gradio['save_model_settings'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)
shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
# For ctx_size and cache_type - auto-adjust GPU layers
for param in ['ctx_size', 'cache_type']:
shared.gradio[param].change(
partial(update_gpu_layers_and_vram, auto_adjust=True),
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info', 'gpu_layers'), show_progress=False)
# For manual gpu_layers changes - only update VRAM
shared.gradio['gpu_layers'].change(
partial(update_gpu_layers_and_vram, auto_adjust=False),
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info'), show_progress=False)
if not shared.args.portable:
shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model'))
shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True)
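The new handlers reuse a single callback, update_gpu_layers_and_vram, in two modes by freezing its auto_adjust keyword with functools.partial: context/cache changes re-pick the layer count and VRAM estimate, while manual slider changes only refresh the estimate. A generic sketch of the pattern (placeholder arithmetic, not the real estimator):

    from functools import partial

    def update_vram(gpu_layers, ctx_size, auto_adjust=False):
        estimate = gpu_layers * ctx_size / 1e6  # placeholder arithmetic
        return (estimate, gpu_layers) if auto_adjust else estimate

    on_ctx_change = partial(update_vram, auto_adjust=True)      # returns (vram, layers)
    on_layers_change = partial(update_vram, auto_adjust=False)  # returns vram only
    print(on_ctx_change(33, 8192))
    print(on_layers_change(33, 8192))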
@ -192,6 +208,26 @@ def load_lora_wrapper(selected_loras):
def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
try:
# Handle direct GGUF URLs
if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")):
try:
path = repo_id.split("huggingface.co/")[1]
# Extract the repository ID (first two parts of the path)
parts = path.split("/")
if len(parts) >= 2:
extracted_repo_id = f"{parts[0]}/{parts[1]}"
# Extract the filename (last part of the path)
filename = repo_id.split("/")[-1]
if "?download=true" in filename:
filename = filename.replace("?download=true", "")
repo_id = extracted_repo_id
specific_file = filename
except:
pass
if repo_id == "":
yield ("Please enter a model path")
return
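The block above recognizes direct Hugging Face .gguf links and folds them back into a repo id plus filename before the normal download path runs. A hedged equivalent using urllib.parse instead of raw string splitting (urlparse drops the query string, e.g. ?download=true, automatically):

    from urllib.parse import urlparse

    def split_gguf_url(url: str):
        path = urlparse(url).path.lstrip('/')  # e.g. 'user/repo/resolve/main/file.gguf'
        parts = path.split('/')
        if len(parts) >= 2 and parts[-1].endswith('.gguf'):
            return f'{parts[0]}/{parts[1]}', parts[-1]
        return None, None

    print(split_gguf_url('https://huggingface.co/u/r/resolve/main/m.gguf?download=true'))
    # ('u/r', 'm.gguf')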
@ -205,6 +241,18 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
yield ("Getting the download links from Hugging Face")
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file)
# Check for multiple GGUF files
gguf_files = [link for link in links if link.lower().endswith('.gguf')]
if len(gguf_files) > 1 and not specific_file:
output = "Multiple GGUF files found. Please copy one of the following filenames to the 'File name' field:\n\n```\n"
for link in gguf_files:
output += f"{Path(link).name}\n"
output += "```"
yield output
return
if return_links:
output = "```\n"
for link in links:
@ -252,10 +300,34 @@ def update_truncation_length(current_length, state):
return current_length
def get_initial_vram_info():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
return update_gpu_layers_and_vram(
shared.args.loader,
shared.model_name,
shared.args.gpu_layers,
shared.args.ctx_size,
shared.args.cache_type,
auto_adjust=False,
for_ui=True
)
return "<div id=\"vram-info\"'>Estimated VRAM to load the model:</span>"
def get_initial_gpu_layers_max():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
model_settings = get_model_metadata(shared.model_name)
return model_settings.get('gpu_layers', 256)
return 256
def handle_load_model_event_initial(model, state):
state = apply_model_settings_to_state(model, state)
output = ui.apply_interface_values(state)
update_model_parameters(state)
update_model_parameters(state) # This updates the command-line flags
return output + [state]

View file

@ -21,7 +21,7 @@ def create_ui(default_preset):
shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button')
with gr.Column():
shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown')
shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown')
with gr.Row():
with gr.Column():
@ -71,8 +71,6 @@ def create_ui(default_preset):
shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.')
shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.')
shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.')
shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.')
with gr.Column():
with gr.Row():
with gr.Column():
@ -82,7 +80,7 @@ def create_ui(default_preset):
shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')

View file

@ -23,11 +23,15 @@ def create_ui():
shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table')
with gr.Column():
extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
extension_status = gr.Markdown()
if not shared.args.portable:
extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
extension_status = gr.Markdown()
shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light')
extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
if not shared.args.portable:
extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
# Reset interface event
shared.gradio['reset_interface'].click(

View file

@ -72,6 +72,20 @@ def natural_keys(text):
return [atoi(c) for c in re.split(r'(\d+)', text)]
def check_model_loaded():
if shared.model_name == 'None' or shared.model is None:
if len(get_available_models()) <= 1:
error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it"
logger.error(error_msg)
return False, error_msg
else:
error_msg = "No model is loaded. Please select one in the Model tab."
logger.error(error_msg)
return False, error_msg
return True, None
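Callers consume the helper as a (bool, message) pair, as in the _generate_reply hunk earlier in this diff - a short usage sketch:

    model_is_loaded, error_message = check_model_loaded()
    if not model_is_loaded:
        print(error_message)  # a generator context would yield '' and return
    else:
        pass  # proceed with generation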
def get_available_models():
# Get all GGUF files
gguf_files = get_available_ggufs()

View file

@ -126,7 +126,7 @@ def check_env():
sys.exit(1)
# Ensure this is a new environment and not the base environment
if os.environ["CONDA_DEFAULT_ENV"] == "base":
if os.environ.get("CONDA_DEFAULT_ENV", "") == "base":
print("Create an environment for this project and activate it. Exiting...")
sys.exit(1)
@ -222,7 +222,7 @@ def update_pytorch_and_python():
if "+cu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
elif "+rocm" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4"
elif "+cpu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
elif "+cxx11" in torver:
@ -273,7 +273,7 @@ def install_webui():
"What is your GPU?",
{
'A': 'NVIDIA - CUDA 12.4',
'B': 'AMD - Linux/macOS only, requires ROCm 6.1',
'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4',
'C': 'Apple M Series',
'D': 'Intel Arc (beta)',
'N': 'CPU mode'
@ -314,7 +314,7 @@ def install_webui():
if selected_gpu == "NVIDIA":
install_pytorch += "--index-url https://download.pytorch.org/whl/cu124"
elif selected_gpu == "AMD":
install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1"
install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4"
elif selected_gpu in ["APPLE", "NONE"]:
install_pytorch += "--index-url https://download.pytorch.org/whl/cpu"
elif selected_gpu == "INTEL":

View file

@ -30,12 +30,12 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
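Every wheel line in these requirements files carries a PEP 508 environment marker; pip evaluates it against the running interpreter and skips non-matching platforms. A sketch of the evaluation using the packaging library (which pip vendors internally):

    from packaging.markers import Marker

    m = Marker('platform_system == "Linux" and platform_machine == "x86_64" '
               'and python_version == "3.11"')
    print(m.evaluate())  # True only on 64-bit Linux under Python 3.11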

View file

@ -29,6 +29,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -29,6 +29,7 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -29,7 +29,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@ -29,8 +29,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -30,12 +30,12 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -1,18 +0,0 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -1,18 +0,0 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@ -15,6 +15,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"

View file

@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
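The `+cu124avx`-style suffixes are PEP 440 local version labels: they let a single release (0.12.0 here) ship several hardware-specific builds under the same public version. A short sketch, assuming the `packaging` library:

from packaging.version import Version

# Local labels sit after "+" and mark build variants of the
# same public release.
v = Version("0.12.0+cu124avx")
print(v.public)  # 0.12.0
print(v.local)   # cu124avx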

View file

@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# Vulkan wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# Vulkan wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@ -51,6 +51,7 @@ from modules.models import load_model, unload_model_if_idle
from modules.models_settings import (
    get_fallback_settings,
    get_model_metadata,
    update_gpu_layers_and_vram,
    update_model_parameters
)
from modules.shared import do_cmd_flags_warnings
@ -90,7 +91,7 @@ def create_interface():
    'instruction_template_str': shared.settings['instruction_template_str'],
    'prompt_menu-default': shared.settings['prompt-default'],
    'prompt_menu-notebook': shared.settings['prompt-notebook'],
    'filter_by_loader': shared.args.loader or 'All'
    'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp'
})
if Path("user_data/cache/pfp_character.png").exists():
@ -127,7 +128,8 @@ def create_interface():
ui_parameters.create_ui(shared.settings['preset']) # Parameters tab
ui_model_menu.create_ui() # Model tab
training.create_ui() # Training tab
if not shared.args.portable:
    training.create_ui() # Training tab
ui_session.create_ui() # Session tab
# Generation events
@ -247,6 +249,20 @@ if __name__ == "__main__":
    model_settings = get_model_metadata(model_name)
    update_model_parameters(model_settings, initial=True) # hijack the command-line arguments

    # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
    if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
        vram_usage, adjusted_layers = update_gpu_layers_and_vram(
            shared.args.loader,
            model_name,
            model_settings['gpu_layers'],
            shared.args.ctx_size,
            shared.args.cache_type,
            auto_adjust=True,
            for_ui=False
        )

        shared.args.gpu_layers = adjusted_layers

    # Load the model
    shared.model, shared.tokenizer = load_model(model_name)

    if shared.args.lora:
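The guard on shared.provided_arguments means the auto-tuner only fills in gpu_layers when the user did not set it on the command line. A minimal sketch of that pattern, with a hypothetical flag name and placeholder value (the project's real bookkeeping lives in its shared module and may differ):

# Hypothetical sketch: auto-tune a setting only when the user did not
# pass the flag explicitly. '--gpu-layers' and the value 33 are
# illustrative placeholders, not the project's real defaults.
import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument('--gpu-layers', type=int, default=0)
args = parser.parse_args()

# Detect both '--gpu-layers 33' and '--gpu-layers=33' spellings.
user_set = any(a == '--gpu-layers' or a.startswith('--gpu-layers=')
               for a in sys.argv[1:])

if not user_set:
    args.gpu_layers = 33  # stand-in for a VRAM-based estimate

print(args.gpu_layers)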

View file

@ -1,10 +1,15 @@
#!/usr/bin/env bash
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
    ./portable_env/bin/python3 server.py --api --auto-launch "$@"
    ./portable_env/bin/python3 server.py --portable --api --auto-launch "$@"
    exit $?
fi
@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
    exit
fi
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
export CUDA_PATH="$INSTALL_ENV_DIR"
export CUDA_HOME="$CUDA_PATH"
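Hoisting the environment-isolation exports above the portable_env check closes a gap: previously the portable interpreter launched before PYTHONNOUSERSITE was set, so packages from the user's own site-packages or an inherited PYTHONPATH could leak into it. A small diagnostic sketch (standard library only) for confirming the isolation from inside Python:

# Illustrative check that the interpreter is running isolated.
# Under PYTHONNOUSERSITE=1 the per-user site-packages directory is
# disabled and should not appear on sys.path.
import os
import site
import sys

print(os.environ.get("PYTHONNOUSERSITE"))      # "1" when the launcher set it
print(site.ENABLE_USER_SITE)                   # False when user site is disabled
print(site.getusersitepackages() in sys.path)  # expected: False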

View file

@ -1,10 +1,15 @@
#!/bin/bash
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
    ./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@"
    ./portable_env/bin/python3 server.py --portable --api --auto-launch --api-port 5005 "$@"
    exit $?
fi
@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
    exit
fi
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
export CUDA_PATH="$INSTALL_ENV_DIR"
export CUDA_HOME="$CUDA_PATH"

View file

@ -1,11 +1,16 @@
@echo off
setlocal enabledelayedexpansion
@rem environment isolation
set PYTHONNOUSERSITE=1
set PYTHONPATH=
set PYTHONHOME=
cd /D "%~dp0"
@rem Portable install case
if exist "portable_env" (
    .\portable_env\python.exe server.py --api --auto-launch %*
    .\portable_env\python.exe server.py --portable --api --auto-launch %*
    exit /b %errorlevel%
)
@ -87,10 +92,6 @@ if not exist "%INSTALL_ENV_DIR%" (
@rem check if conda environment was actually created
if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end )
@rem environment isolation
set PYTHONNOUSERSITE=1
set PYTHONPATH=
set PYTHONHOME=
set "CUDA_PATH=%INSTALL_ENV_DIR%"
set "CUDA_HOME=%CUDA_PATH%"

View file

@ -18,7 +18,6 @@ max_new_tokens_min: 1
max_new_tokens_max: 4096
prompt_lookup_num_tokens: 0
max_tokens_second: 0
max_updates_second: 12
auto_max_new_tokens: true
ban_eos_token: false
add_bos_token: true
@ -31,7 +30,6 @@ seed: -1
custom_stopping_strings: ''
custom_token_bans: ''
negative_prompt: ''
autoload_model: false
dark_theme: true
default_extensions: []
instruction_template_str: |-