Merge pull request #6984 from oobabooga/dev

Merge dev branch
oobabooga 2025-05-16 17:13:26 -03:00 committed by GitHub
commit dc3094549e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
65 changed files with 955 additions and 406 deletions

View file

@ -102,6 +102,8 @@ jobs:
shell: bash shell: bash
run: | run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables # Define common variables
CUDA_VERSION="${{ matrix.cuda }}" CUDA_VERSION="${{ matrix.cuda }}"
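The new cleanup step in these packaging workflows keeps only an allow-list of extension folders and deletes the rest. A standalone sketch of the same filtering logic, written in Python for clarity (the directory layout is illustrative, not taken from the workflow):

```python
from pathlib import Path
import shutil

# Same effect as the grep/xargs pipeline in the workflow step above:
# keep only the allow-listed extension folders, remove everything else.
allowed = {"character_bias", "gallery", "openai", "sd_api_pictures"}

for entry in Path("extensions").iterdir():
    if entry.is_dir() and entry.name not in allowed:
        shutil.rmtree(entry)
```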

View file

@ -101,6 +101,8 @@ jobs:
shell: bash shell: bash
run: | run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables # Define common variables
AVX_SUPPORT="${{ matrix.avx }}" AVX_SUPPORT="${{ matrix.avx }}"

View file

@ -101,6 +101,8 @@ jobs:
shell: bash shell: bash
run: | run: |
rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf
# Define common variables # Define common variables
AVX_SUPPORT="${{ matrix.avx }}" AVX_SUPPORT="${{ matrix.avx }}"

View file

@ -22,7 +22,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these. - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these.
- Multiple sampling parameters and generation options for sophisticated text generation control. - Multiple sampling parameters and generation options for sophisticated text generation control.
- Switch between different models easily in the UI without restarting, with fine control over settings. - Switch between different models easily in the UI without restarting, with fine control over settings.
- OpenAI-compatible API with Chat and Completions endpoints - see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support - see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
- 100% offline and private, with zero telemetry, external resources, or remote update requests. - 100% offline and private, with zero telemetry, external resources, or remote update requests.
- Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
@ -44,7 +44,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases
To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again. To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again.
You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`. You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, or `update_wizard_macos.sh`.
<details> <details>
<summary> <summary>
@ -55,12 +55,12 @@ Setup details and information about installing manually
The script uses Miniconda to set up a Conda environment in the `installer_files` folder. The script uses Miniconda to set up a Conda environment in the `installer_files` folder.
If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, `cmd_macos.sh`, or `cmd_wsl.bat`. If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`.
* There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root. * There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root.
* To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts. * To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts.
* For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki). * For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki).
* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. * For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.
### Manual installation using Conda ### Manual installation using Conda
@ -90,7 +90,7 @@ conda activate textgen
|--------|---------|---------| |--------|---------|---------|
| Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | | Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
| Linux/WSL | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu` | | Linux/WSL | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu` |
| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.1` | | Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` |
| MacOS + MPS | Any | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` | | MacOS + MPS | Any | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` |
| Windows | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | | Windows | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
| Windows | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` | | Windows | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` |

View file

@ -1,7 +1,9 @@
.message { .message {
display: grid; display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr); grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 28px; padding-bottom: 14px;
padding-top: 14px;
font-size: 18px; font-size: 18px;
font-family: Roboto, Arial, sans-serif; /* Modern font */ font-family: Roboto, Arial, sans-serif; /* Modern font */
line-height: 1.5; line-height: 1.5;
@ -102,6 +104,7 @@
@media screen and (width <= 688px) { @media screen and (width <= 688px) {
.message { .message {
display: grid; display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr); grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 25px; padding-bottom: 25px;
font-size: 15px; font-size: 15px;

View file

@ -2,8 +2,10 @@
.message { .message {
display: grid; display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr); grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 28px; padding-bottom: 14px;
padding-top: 14px;
font-size: 18px; font-size: 18px;
font-family: 'Noto Sans', Arial, sans-serif; font-family: 'Noto Sans', Arial, sans-serif;
line-height: 1.428571429; line-height: 1.428571429;
@ -100,6 +102,7 @@
@media screen and (width <= 688px) { @media screen and (width <= 688px) {
.message { .message {
display: grid; display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr); grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 25px; padding-bottom: 25px;
font-size: 15px; font-size: 15px;

View file

@ -16,6 +16,7 @@
} }
.message { .message {
padding-bottom: 2em; padding-bottom: 1em;
padding-top: 1em;
grid-template-columns: 70px minmax(0, 1fr); grid-template-columns: 70px minmax(0, 1fr);
} }

View file

@ -1,7 +1,9 @@
.message { .message {
display: grid; display: grid;
align-items: start;
grid-template-columns: 60px minmax(0, 1fr); grid-template-columns: 60px minmax(0, 1fr);
padding-bottom: 2em; padding-bottom: 1em;
padding-top: 1em;
font-size: 15px; font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif; font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 22.5px !important; line-height: 22.5px !important;

View file

@ -1,5 +1,6 @@
.message { .message {
padding-bottom: 25px; padding-bottom: 12.5px;
padding-top: 12.5px;
font-size: 15px; font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif; font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 1.428571429; line-height: 1.428571429;

View file

@ -1,5 +1,6 @@
.message { .message {
padding-bottom: 25px; padding-bottom: 12.5px;
padding-top: 12.5px;
font-size: 15px; font-size: 15px;
font-family: 'Noto Sans', Helvetica, Arial, sans-serif; font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
line-height: 1.428571429; line-height: 1.428571429;

View file

@ -8,10 +8,6 @@
padding-top: 0 !important; padding-top: 0 !important;
} }
.chat > .messages > :last-child {
margin-bottom: 1.7rem !important;
}
.chat .message-body p, .chat .message-body li { .chat .message-body p, .chat .message-body li {
font-size: 1rem !important; font-size: 1rem !important;
line-height: 28px !important; line-height: 28px !important;
@ -46,7 +42,7 @@
} }
.chat .user-message { .chat .user-message {
background: #f5f5f5; background: #f3f4f6;
padding: 1.5rem 1rem; padding: 1.5rem 1rem;
padding-bottom: 2rem; padding-bottom: 2rem;
border-radius: 0; border-radius: 0;
@ -61,16 +57,16 @@
} }
.dark .chat .user-message { .dark .chat .user-message {
background: transparent; background: var(--light-gray);
} }
.dark .chat .assistant-message { .dark .chat .assistant-message {
background: var(--light-gray); background: transparent;
} }
.chat .user-message .text, .chat .user-message .text,
.chat .assistant-message .text { .chat .assistant-message .text {
max-width: 645px; max-width: 700px;
margin-left: auto; margin-left: auto;
margin-right: auto; margin-right: auto;
} }

View file

@ -2,7 +2,7 @@
--darker-gray: #202123; --darker-gray: #202123;
--dark-gray: #343541; --dark-gray: #343541;
--light-gray: #444654; --light-gray: #444654;
--light-theme-gray: #f5f5f5; --light-theme-gray: #f9fbff;
--border-color-dark: #525252; --border-color-dark: #525252;
--header-width: 112px; --header-width: 112px;
--selected-item-color-dark: #32333e; --selected-item-color-dark: #32333e;
@ -389,7 +389,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.chat { .chat {
margin-left: auto; margin-left: auto;
margin-right: auto; margin-right: auto;
min-height: var(--chat-height); flex: 1;
overflow-y: auto; overflow-y: auto;
display: flex; display: flex;
flex-direction: column; flex-direction: column;
@ -401,10 +401,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
} }
.chat-parent { .chat-parent {
height: calc(100dvh - 98px - var(--input-delta)); flex: 1;
overflow: auto !important; overflow: auto !important;
border-radius: 0 !important; border-radius: 0 !important;
margin-bottom: var(--input-delta) !important;
} }
.chat-parent .prose { .chat-parent .prose {
@ -421,13 +420,13 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
} }
.chat-parent.bigchat { .chat-parent.bigchat {
height: calc(100dvh - 98px - var(--input-delta)) !important; flex: 1;
margin-bottom: var(--input-delta) !important;
} }
.chat > .messages { .chat > .messages {
display: flex; display: flex;
flex-direction: column; flex-direction: column;
min-height: calc(100vh - 102px);
} }
.chat > .messages > :first-child { .chat > .messages > :first-child {
@ -546,7 +545,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
border-radius: 5px; border-radius: 5px;
font-size: 82%; font-size: 82%;
padding: 1px 3px; padding: 1px 3px;
background: white !important; background: #f3f4f6 !important;
color: #1f2328; color: #1f2328;
} }
@ -560,18 +559,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
padding: 15px; padding: 15px;
} }
.message-body :not(pre) > code::before {
content: "`";
}
.message-body :not(pre) > code::after {
content: "`";
}
.message-body :not(pre) > code { .message-body :not(pre) > code {
white-space: normal !important; white-space: normal !important;
font-weight: bold; font-weight: bold;
font-family: unset; font-size: 0.95em;
font-family: Menlo,"Roboto Mono","Courier New",Courier,monospace,Inter,sans-serif;
padding: .15rem .3rem;
background-color: #ececec;
}
.dark .message-body :not(pre) > code {
background-color: rgb(255 255 255 / 10%);
} }
#chat-input { #chat-input {
@ -582,7 +580,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
} }
#chat-input textarea { #chat-input textarea {
background: #f3f4f6;
padding: 0.65rem 2.5rem; padding: 0.65rem 2.5rem;
border: 0;
box-shadow: 0;
} }
#chat-input textarea::placeholder { #chat-input textarea::placeholder {
@ -603,8 +604,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
} }
.chat-input-positioned { .chat-input-positioned {
position: absolute;
bottom: 0;
max-width: 54rem; max-width: 54rem;
left: 50%; left: 50%;
transform: translateX(-50%); transform: translateX(-50%);
@ -744,7 +743,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
.hover-menu button { .hover-menu button {
width: 100%; width: 100%;
background: transparent !important; background: white !important;
border-radius: 0 !important; border-radius: 0 !important;
justify-content: space-between; justify-content: space-between;
margin: 0 !important; margin: 0 !important;
@ -760,7 +759,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
} }
.hover-menu button:hover { .hover-menu button:hover {
background: var(--button-secondary-background-fill-hover) !important; background: #dbeafe !important;
}
.dark .hover-menu button:hover {
background: var(--selected-item-color-dark) !important;
} }
.transparent-substring { .transparent-substring {
@ -789,6 +792,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
} }
#chat-input-container { #chat-input-container {
display: flex;
flex-direction: column;
min-width: 0 !important; min-width: 0 !important;
} }
@ -798,9 +803,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
} }
#chat-input-row { #chat-input-row {
padding-bottom: 1.5em; padding: 1rem;
padding-left: 1rem; padding-top: 0;
padding-right: 1rem;
} }
#chat-input-row.bigchat { #chat-input-row.bigchat {
@ -808,22 +812,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
} }
#chat-col { #chat-col {
padding-bottom: 100px; height: 100dvh;
display: flex;
flex-direction: column;
padding-bottom: 0;
gap: 0;
} }
@media screen and (width <= 924px) { @media screen and (width <= 924px) {
#chat-col { #chat-col {
padding-bottom: 100px;
margin-top: 32px; margin-top: 32px;
position: relative; /* Ensure positioning for the pseudo-element */ height: calc(100dvh - 32px);
}
.chat-parent {
height: calc(100dvh - 98px - var(--input-delta) - 32px);
}
.chat-parent.bigchat {
height: calc(100dvh - 98px - var(--input-delta) - 32px) !important;
} }
} }
@ -985,6 +984,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
cursor: pointer; cursor: pointer;
} }
#past-chats .selected,
#past-chats label:hover {
background-color: #dbeafe !important;
}
#past-chats-buttons, #past-chats-buttons,
#delete-chat-row, #delete-chat-row,
#rename-row { #rename-row {
@ -993,7 +997,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
gap: 9px; gap: 9px;
} }
#past-chats-row, #past-chats-row,
#chat-controls { #chat-controls {
width: 260px; width: 260px;
@ -1111,12 +1114,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
color: #9ca3af; color: #9ca3af;
} }
.dark .hover-menu {
background-color: var(--darker-gray);
}
.dark .hover-menu button { .dark .hover-menu button {
border-color: var(--border-color-primary); border-color: var(--border-color-primary);
background-color: var(--darker-gray) !important;
} }
.dark #chat-controls, .dark #chat-controls,
@ -1125,8 +1125,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
border: 0 !important; border: 0 !important;
} }
.dark #past-chats .selected, .dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats .selected,
.dark #past-chats label:hover { .dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats label:hover {
background-color: var(--selected-item-color-dark) !important; background-color: var(--selected-item-color-dark) !important;
} }
@ -1163,7 +1163,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
} }
.header_bar button.selected { .header_bar button.selected {
background: #E0E0E0; background: #dbeafe;
} }
#chat-controls, #chat-controls,
@ -1382,3 +1382,19 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
50% { opacity: 1; } 50% { opacity: 1; }
100% { opacity: 0.6; } 100% { opacity: 0.6; }
} }
strong {
font-weight: bold;
}
.min.svelte-1ybaih5 {
min-height: 0;
}
#vram-info .value {
color: #008d00;
}
.dark #vram-info .value {
color: #07ff07;
}

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972} APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972} APP_UID: ${APP_UID:-6972}
env_file: .env env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports: ports:
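The only change in these compose files is `${APP_UID-6972}` becoming `${APP_UID:-6972}`. The practical difference: `-` substitutes the default only when the variable is unset, while `:-` also substitutes when it is set but empty. A rough Python analogy (not part of the repo):

```python
import os

os.environ["APP_UID"] = ""                    # set, but empty

# ${APP_UID-6972}: default applies only when the variable is unset
print(os.environ.get("APP_UID", "6972"))      # -> "" (the empty value wins)

# ${APP_UID:-6972}: default applies when unset *or* empty
print(os.environ.get("APP_UID") or "6972")    # -> "6972"
```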

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972} APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972} APP_UID: ${APP_UID:-6972}
env_file: .env env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports: ports:

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972} APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972} APP_UID: ${APP_UID:-6972}
env_file: .env env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports: ports:

View file

@ -14,7 +14,7 @@ WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/ COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
WORKDIR /home/app/text-generation-webui WORKDIR /home/app/text-generation-webui
# set umask to ensure group read / write at runtime # set umask to ensure group read / write at runtime

View file

@ -22,7 +22,7 @@ services:
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
APP_GID: ${APP_GID:-6972} APP_GID: ${APP_GID:-6972}
APP_UID: ${APP_UID-6972} APP_UID: ${APP_UID:-6972}
env_file: .env env_file: .env
user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
ports: ports:
@ -31,17 +31,7 @@ services:
stdin_open: true stdin_open: true
tty: true tty: true
volumes: volumes:
- ./cache:/home/app/text-generation-webui/cache - ./user_data:/home/app/text-generation-webui/user_data
- ./characters:/home/app/text-generation-webui/characters
- ./extensions:/home/app/text-generation-webui/extensions
- ./loras:/home/app/text-generation-webui/loras
- ./logs:/home/app/text-generation-webui/logs
- ./models:/home/app/text-generation-webui/models
- ./presets:/home/app/text-generation-webui/presets
- ./prompts:/home/app/text-generation-webui/prompts
- ./softprompts:/home/app/text-generation-webui/softprompts
- ./training:/home/app/text-generation-webui/training
- ./cloudflared:/etc/cloudflared
deploy: deploy:
resources: resources:
reservations: reservations:

View file

@ -257,6 +257,85 @@ headers = {
in any of the examples above. in any of the examples above.
#### Tool/Function Calling Example
You need to use a model with tool support. The prompt will be automatically formatted using the model's Jinja2 template.
Request:
```
curl http://127.0.0.1:5000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What time is it currently in New York City?"
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_time",
"description": "Get current time in a specific timezones",
"parameters": {
"type": "object",
"required": ["timezone"],
"properties": {
"timezone": {
"type": "string",
"description": "IANA timezone name (e.g., America/New_York, Europe/London). Use Europe/Berlin as local timezone if no timezone provided by the user."
}
}
}
}
}
]
}'
```
Sample response:
```
{
"id": "chatcmpl-1746532051477984256",
"object": "chat.completion",
"created": 1746532051,
"model": "qwen2.5-coder-14b-instruct-q4_k_m.gguf",
"choices": [
{
"index": 0,
"finish_reason": "tool_calls",
"message": {
"role": "assistant",
"content": "```xml\n<function>\n{\n \"name\": \"get_current_time\",\n \"arguments\": {\n \"timezone\": \"America/New_York\"\n }\n}\n</function>\n```"
},
"tool_calls": [
{
"type": "function",
"function": {
"name": "get_current_time",
"arguments": "{\"timezone\": \"America/New_York\"}"
},
"id": "call_52ij07mh",
"index": "0"
}
]
}
],
"usage": {
"prompt_tokens": 224,
"completion_tokens": 38,
"total_tokens": 262
}
}
```
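Because the endpoint is OpenAI-compatible, the same request can also be issued from Python with the `openai` client package. A minimal sketch, assuming the server is running locally on port 5000 with a tools-capable model loaded (the API key and model name are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:5000/v1", api_key="not-needed")

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_time",
        "description": "Get the current time in a specific timezone",
        "parameters": {
            "type": "object",
            "required": ["timezone"],
            "properties": {
                "timezone": {"type": "string", "description": "IANA timezone name"}
            },
        },
    },
}]

response = client.chat.completions.create(
    model="placeholder",  # placeholder; the server uses whichever model is currently loaded
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What time is it currently in New York City?"},
    ],
    tools=tools,
)

# As in the sample response above, a tool call is signalled via
# finish_reason == "tool_calls" on the returned choice.
print(response.choices[0].finish_reason)
print(response.model_dump_json(indent=2))
```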
### Environment variables ### Environment variables
The following environment variables can be used (they take precedence over everything else): The following environment variables can be used (they take precedence over everything else):

View file

@ -1,16 +1,14 @@
import base64
import copy import copy
import re import json
import time import time
from collections import deque from collections import deque
from io import BytesIO
import requests
import tiktoken import tiktoken
from PIL import Image from pydantic import ValidationError
from extensions.openai.errors import InvalidRequestError from extensions.openai.errors import InvalidRequestError
from extensions.openai.utils import debug_msg from extensions.openai.typing import ToolDefinition
from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall
from modules import shared from modules import shared
from modules.chat import ( from modules.chat import (
generate_chat_prompt, generate_chat_prompt,
@ -96,72 +94,32 @@ def convert_history(history):
user_input_last = True user_input_last = True
system_message = "" system_message = ""
# Multimodal: convert OpenAI format to multimodal extension format
if any('content' in entry and isinstance(entry['content'], list) for entry in history):
new_history = []
for entry in history:
if isinstance(entry['content'], list):
for item in entry['content']:
if not isinstance(item, dict):
continue
image_url = None
content = None
if item['type'] == 'image_url' and isinstance(item['image_url'], dict):
image_url = item['image_url']['url']
elif item['type'] == 'text' and isinstance(item['text'], str):
content = item['text']
if image_url:
new_history.append({"image_url": image_url, "role": "user"})
if content:
new_history.append({"content": content, "role": "user"})
else:
new_history.append(entry)
history = new_history
for entry in history: for entry in history:
if "image_url" in entry: content = entry["content"]
image_url = entry['image_url']
if "base64" in image_url:
image_url = re.sub('^data:image/.+;base64,', '', image_url)
img = Image.open(BytesIO(base64.b64decode(image_url)))
else:
try:
my_res = requests.get(image_url)
img = Image.open(BytesIO(my_res.content))
except Exception:
raise 'Image cannot be loaded from the URL!'
buffered = BytesIO()
if img.mode in ("RGBA", "P"):
img = img.convert("RGB")
img.save(buffered, format="JPEG")
img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
content = f'<img src="data:image/jpeg;base64,{img_str}">'
else:
content = entry["content"]
role = entry["role"] role = entry["role"]
if role == "user": if role == "user":
user_input = content user_input = content
user_input_last = True user_input_last = True
if current_message: if current_message:
chat_dialogue.append([current_message, '']) chat_dialogue.append([current_message, '', ''])
current_message = "" current_message = ""
current_message = content current_message = content
elif role == "assistant": elif role == "assistant":
if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "":
continue # skip tool calls
current_reply = content current_reply = content
user_input_last = False user_input_last = False
if current_message: if current_message:
chat_dialogue.append([current_message, current_reply]) chat_dialogue.append([current_message, current_reply, ''])
current_message = "" current_message = ""
current_reply = "" current_reply = ""
else: else:
chat_dialogue.append(['', current_reply]) chat_dialogue.append(['', current_reply, ''])
elif role == "tool":
user_input_last = False
chat_dialogue.append(['', '', content])
elif role == "system": elif role == "system":
system_message += f"\n{content}" if system_message else content system_message += f"\n{content}" if system_message else content
@ -181,6 +139,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
if 'messages' not in body: if 'messages' not in body:
raise InvalidRequestError(message="messages is required", param='messages') raise InvalidRequestError(message="messages is required", param='messages')
tools = None
if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0:
tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails
messages = body['messages'] messages = body['messages']
for m in messages: for m in messages:
if 'role' not in m: if 'role' not in m:
@ -238,6 +200,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
'custom_system_message': custom_system_message, 'custom_system_message': custom_system_message,
'chat_template_str': chat_template_str, 'chat_template_str': chat_template_str,
'chat-instruct_command': chat_instruct_command, 'chat-instruct_command': chat_instruct_command,
'tools': tools,
'history': history, 'history': history,
'stream': stream 'stream': stream
}) })
@ -250,7 +213,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
requested_model = generate_params.pop('model') requested_model = generate_params.pop('model')
logprob_proc = generate_params.pop('logprob_proc', None) logprob_proc = generate_params.pop('logprob_proc', None)
def chat_streaming_chunk(content): def chat_streaming_chunk(content, chunk_tool_calls=None):
# begin streaming # begin streaming
chunk = { chunk = {
"id": cmpl_id, "id": cmpl_id,
@ -260,7 +223,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
resp_list: [{ resp_list: [{
"index": 0, "index": 0,
"finish_reason": None, "finish_reason": None,
"delta": {'role': 'assistant', 'content': content}, "delta": {'role': 'assistant', 'content': content, 'tool_calls': chunk_tool_calls},
}], }],
} }
@ -269,6 +232,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]} chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
# else: # else:
# chunk[resp_list][0]["logprobs"] = None # chunk[resp_list][0]["logprobs"] = None
return chunk return chunk
# generate reply ####################################### # generate reply #######################################
@ -277,8 +241,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
yield {'prompt': prompt} yield {'prompt': prompt}
return return
debug_msg({'prompt': prompt, 'generate_params': generate_params})
if stream: if stream:
yield chat_streaming_chunk('') yield chat_streaming_chunk('')
@ -288,8 +250,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
answer = '' answer = ''
seen_content = '' seen_content = ''
tool_calls = []
end_last_tool_call = 0
supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None
for a in generator: for a in generator:
answer = a['internal'][-1][1] answer = a['internal'][-1][1]
if supported_tools is not None:
tool_call = parseToolCall(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else []
if len(tool_call) > 0:
for tc in tool_call:
tc["id"] = getToolCallId()
tc["index"] = str(len(tool_calls))
tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"])
tool_calls.append(tc)
end_last_tool_call = len(answer)
if stream: if stream:
len_seen = len(seen_content) len_seen = len(seen_content)
new_content = answer[len_seen:] new_content = answer[len_seen:]
@ -297,18 +274,25 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet. if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet.
continue continue
seen_content = answer
chunk = chat_streaming_chunk(new_content) chunk = chat_streaming_chunk(new_content)
seen_content = answer
yield chunk yield chunk
# stop generation if tool_calls were generated previously
if len(tool_calls) > 0:
break
token_count = len(encode(prompt)[0]) token_count = len(encode(prompt)[0])
completion_token_count = len(encode(answer)[0]) completion_token_count = len(encode(answer)[0])
stop_reason = "stop" stop_reason = "stop"
if len(tool_calls) > 0:
stop_reason = "tool_calls"
if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']: if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']:
stop_reason = "length" stop_reason = "length"
if stream: if stream:
chunk = chat_streaming_chunk('') chunk = chat_streaming_chunk('', tool_calls)
chunk[resp_list][0]['finish_reason'] = stop_reason chunk[resp_list][0]['finish_reason'] = stop_reason
chunk['usage'] = { chunk['usage'] = {
"prompt_tokens": token_count, "prompt_tokens": token_count,
@ -326,7 +310,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
resp_list: [{ resp_list: [{
"index": 0, "index": 0,
"finish_reason": stop_reason, "finish_reason": stop_reason,
"message": {"role": "assistant", "content": answer} "message": {"role": "assistant", "content": answer},
"tool_calls": tool_calls
}], }],
"usage": { "usage": {
"prompt_tokens": token_count, "prompt_tokens": token_count,
@ -515,3 +500,19 @@ def completions(body: dict, is_legacy: bool = False) -> dict:
def stream_completions(body: dict, is_legacy: bool = False): def stream_completions(body: dict, is_legacy: bool = False):
for resp in completions_common(body, is_legacy, stream=True): for resp in completions_common(body, is_legacy, stream=True):
yield resp yield resp
def validateTools(tools: list[dict]):
# Validate each tool definition in the JSON array
valid_tools = None
for idx in range(len(tools)):
tool = tools[idx]
try:
tool_definition = ToolDefinition(**tool)
if valid_tools is None:
valid_tools = []
valid_tools.append(tool)
except ValidationError:
raise InvalidRequestError(message=f"Invalid tool specification at index {idx}.", param='tools')
return valid_tools

View file

@ -14,6 +14,7 @@ from fastapi.requests import Request
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from pydub import AudioSegment from pydub import AudioSegment
from sse_starlette import EventSourceResponse from sse_starlette import EventSourceResponse
from starlette.concurrency import iterate_in_threadpool
import extensions.openai.completions as OAIcompletions import extensions.openai.completions as OAIcompletions
import extensions.openai.images as OAIimages import extensions.openai.images as OAIimages
@ -115,7 +116,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
async def generator(): async def generator():
async with streaming_semaphore: async with streaming_semaphore:
response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy) response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy)
for resp in response: async for resp in iterate_in_threadpool(response):
disconnected = await request.is_disconnected() disconnected = await request.is_disconnected()
if disconnected: if disconnected:
break break
@ -125,7 +126,12 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
return EventSourceResponse(generator()) # SSE streaming return EventSourceResponse(generator()) # SSE streaming
else: else:
response = OAIcompletions.completions(to_dict(request_data), is_legacy=is_legacy) response = await asyncio.to_thread(
OAIcompletions.completions,
to_dict(request_data),
is_legacy=is_legacy
)
return JSONResponse(response) return JSONResponse(response)
@ -138,7 +144,7 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
async def generator(): async def generator():
async with streaming_semaphore: async with streaming_semaphore:
response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy) response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy)
for resp in response: async for resp in iterate_in_threadpool(response):
disconnected = await request.is_disconnected() disconnected = await request.is_disconnected()
if disconnected: if disconnected:
break break
@ -148,7 +154,12 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
return EventSourceResponse(generator()) # SSE streaming return EventSourceResponse(generator()) # SSE streaming
else: else:
response = OAIcompletions.chat_completions(to_dict(request_data), is_legacy=is_legacy) response = await asyncio.to_thread(
OAIcompletions.chat_completions,
to_dict(request_data),
is_legacy=is_legacy
)
return JSONResponse(response) return JSONResponse(response)
@ -436,7 +447,7 @@ def run_server():
# Start server # Start server
logging.getLogger("uvicorn.error").propagate = False logging.getLogger("uvicorn.error").propagate = False
uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)
def setup(): def setup():
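The streaming endpoints above now drive the blocking completion generators through `iterate_in_threadpool`, and the non-streaming paths are moved to `asyncio.to_thread`, so generation no longer blocks the event loop. A self-contained sketch of the same pattern (the slow generator is a stand-in, not repo code):

```python
import asyncio
import time

from starlette.concurrency import iterate_in_threadpool


def slow_chunks():
    # Stand-in for a blocking token generator.
    for i in range(3):
        time.sleep(0.2)
        yield f"chunk {i}"


async def main():
    # The blocking generator runs in a worker thread, so other
    # coroutines keep running while we wait for each chunk.
    async for chunk in iterate_in_threadpool(slow_chunks()):
        print(chunk)


asyncio.run(main())
```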

View file

@ -1,8 +1,8 @@
import json import json
import time import time
from typing import Dict, List from typing import Dict, List, Optional
from pydantic import BaseModel, Field from pydantic import BaseModel, Field, validator
class GenerationOptions(BaseModel): class GenerationOptions(BaseModel):
@ -54,6 +54,48 @@ class GenerationOptions(BaseModel):
grammar_string: str = "" grammar_string: str = ""
class ToolDefinition(BaseModel):
function: 'ToolFunction'
type: str
class ToolFunction(BaseModel):
description: str
name: str
parameters: 'ToolParameters'
class ToolParameters(BaseModel):
properties: Optional[Dict[str, 'ToolProperty']] = None
required: Optional[list[str]] = None
type: str
description: Optional[str] = None
class ToolProperty(BaseModel):
description: Optional[str] = None
type: Optional[str] = None # we are faced with definitions like anyOf, e.g. {'type': 'function', 'function': {'name': 'git_create_branch', 'description': 'Creates a new branch from an optional base branch', 'parameters': {'type': 'object', 'properties': {'repo_path': {'title': 'Repo Path', 'type': 'string'}, 'branch_name': {'title': 'Branch Name', 'type': 'string'}, 'base_branch': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'Base Branch'}}, 'required': ['repo_path', 'branch_name'], 'title': 'GitCreateBranch'}}}
class FunctionCall(BaseModel):
name: str
arguments: Optional[str] = None
parameters: Optional[str] = None
@validator('arguments', allow_reuse=True)
def checkPropertyArgsOrParams(cls, v, values, **kwargs):
if not v and not values.get('parameters'):
raise ValueError("At least one of 'arguments' or 'parameters' must be provided as property in FunctionCall type")
return v
class ToolCall(BaseModel):
id: str
index: int
type: str
function: FunctionCall
class CompletionRequestParams(BaseModel): class CompletionRequestParams(BaseModel):
model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.") model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.")
prompt: str | List[str] prompt: str | List[str]
@ -92,6 +134,7 @@ class ChatCompletionRequestParams(BaseModel):
frequency_penalty: float | None = 0 frequency_penalty: float | None = 0
function_call: str | dict | None = Field(default=None, description="Unused parameter.") function_call: str | dict | None = Field(default=None, description="Unused parameter.")
functions: List[dict] | None = Field(default=None, description="Unused parameter.") functions: List[dict] | None = Field(default=None, description="Unused parameter.")
tools: List[dict] | None = Field(default=None, description="Tools signatures passed via MCP.")
logit_bias: dict | None = None logit_bias: dict | None = None
max_tokens: int | None = None max_tokens: int | None = None
n: int | None = Field(default=1, description="Unused parameter.") n: int | None = Field(default=1, description="Unused parameter.")
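These Pydantic models are what `validateTools` (shown earlier in completions.py) checks incoming `tools` entries against. A small sketch of that validation in isolation, assuming the webui source tree is on the Python path (the tool dict is illustrative):

```python
from pydantic import ValidationError

from extensions.openai.typing import ToolDefinition

tool = {
    "type": "function",
    "function": {
        "name": "get_current_time",
        "description": "Get the current time in a specific timezone",
        "parameters": {
            "type": "object",
            "required": ["timezone"],
            "properties": {
                "timezone": {"type": "string", "description": "IANA timezone name"}
            },
        },
    },
}

try:
    ToolDefinition(**tool)  # raises ValidationError for a malformed spec
    print("tool definition accepted")
except ValidationError as err:
    print("tool definition rejected:", err)
```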

View file

@ -1,5 +1,8 @@
import base64 import base64
import json
import os import os
import random
import re
import time import time
import traceback import traceback
from typing import Callable, Optional from typing import Callable, Optional
@ -52,3 +55,94 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star
time.sleep(3) time.sleep(3)
raise Exception('Could not start cloudflared.') raise Exception('Could not start cloudflared.')
def getToolCallId() -> str:
letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789"
b = [random.choice(letter_bytes) for _ in range(8)]
return "call_" + "".join(b).lower()
def checkAndSanitizeToolCallCandidate(candidate_dict: dict, tool_names: list[str]):
# check if property 'function' exists and is a dictionary, otherwise adapt dict
if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str):
candidate_dict = {"type": "function", "function": candidate_dict}
if 'function' in candidate_dict and isinstance(candidate_dict['function'], str):
candidate_dict['name'] = candidate_dict['function']
del candidate_dict['function']
candidate_dict = {"type": "function", "function": candidate_dict}
if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict):
# check if 'name' exists within 'function' and is part of known tools
if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names:
candidate_dict["type"] = "function" # ensure required property 'type' exists and has the right value
# map property 'parameters' used by some older models to 'arguments'
if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]:
candidate_dict["function"]["arguments"] = candidate_dict["function"]["parameters"]
del candidate_dict["function"]["parameters"]
return candidate_dict
return None
def parseToolCall(answer: str, tool_names: list[str]):
matches = []
# abort on very short answers to save computation cycles
if len(answer) < 10:
return matches
# Define the regex pattern to find the JSON content wrapped in <function>, <tools>, <tool_call>, and other tags observed from various models
patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)</\1>"]
for pattern in patterns:
for match in re.finditer(pattern, answer, re.DOTALL):
# print(match.group(2))
if match.group(2) is None:
continue
# remove backtick wraps if present
candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip())
candidate = re.sub(r"```$", "", candidate.strip())
# unwrap inner tags
candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL)
# llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually
if re.search(r"\}\s*\n\s*\{", candidate) is not None:
candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
if not candidate.strip().startswith("["):
candidate = "[" + candidate + "]"
candidates = []
try:
# parse the candidate JSON into a dictionary
candidates = json.loads(candidate)
if not isinstance(candidates, list):
candidates = [candidates]
except json.JSONDecodeError:
# Ignore invalid JSON silently
continue
for candidate_dict in candidates:
checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
if checked_candidate is not None:
matches.append(checked_candidate)
# last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags
if len(matches) == 0:
try:
candidate = answer
# llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually
if re.search(r"\}\s*\n\s*\{", candidate) is not None:
candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
if not candidate.strip().startswith("["):
candidate = "[" + candidate + "]"
# parse the candidate JSON into a dictionary
candidates = json.loads(candidate)
if not isinstance(candidates, list):
candidates = [candidates]
for candidate_dict in candidates:
checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
if checked_candidate is not None:
matches.append(checked_candidate)
except json.JSONDecodeError:
# Ignore invalid JSON silently
pass
return matches
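A quick sketch of how these helpers behave on a typical model answer that wraps a JSON tool call in `<tool_call>` tags (illustrative input, assuming the webui source tree is importable):

```python
from extensions.openai.utils import getToolCallId, parseToolCall

answer = (
    "I'll check the time for you.\n"
    '<tool_call>{"name": "get_current_time", "arguments": {"timezone": "America/New_York"}}</tool_call>'
)

calls = parseToolCall(answer, tool_names=["get_current_time"])
for call in calls:
    call["id"] = getToolCallId()  # e.g. "call_x1y2z3a4"
    print(call["function"]["name"], call["function"]["arguments"])
```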

View file

@ -1,10 +1,11 @@
import math import math
import random import random
import threading import threading
import torch
import chromadb import chromadb
import numpy as np import numpy as np
import posthog import posthog
import torch
from chromadb.config import Settings from chromadb.config import Settings
from chromadb.utils import embedding_functions from chromadb.utils import embedding_functions
@ -292,6 +293,8 @@ class ChromaCollector():
for doc in documents: for doc in documents:
doc_tokens = encode(doc)[0] doc_tokens = encode(doc)[0]
if isinstance(doc_tokens, np.ndarray):
doc_tokens = doc_tokens.tolist()
doc_token_count = len(doc_tokens) doc_token_count = len(doc_tokens)
if current_token_count + doc_token_count > max_token_count: if current_token_count + doc_token_count > max_token_count:
# If adding this document would exceed the max token count, # If adding this document would exceed the max token count,

View file

@ -150,6 +150,16 @@ const observer = new MutationObserver(function(mutations) {
if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) { if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) {
targetElement.scrollTop = targetElement.scrollHeight; targetElement.scrollTop = targetElement.scrollHeight;
} }
const chatElement = document.getElementById("chat");
if (chatElement) {
const messagesContainer = chatElement.querySelector(".messages");
const lastChild = messagesContainer?.lastElementChild;
const prevSibling = lastChild?.previousElementSibling;
if (lastChild && prevSibling) {
lastChild.style.minHeight = `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px))`;
}
}
}); });
// Configure the observer to watch for changes in the subtree and attributes // Configure the observer to watch for changes in the subtree and attributes
@ -442,12 +452,6 @@ function updateCssProperties() {
// Check if the chat container is visible // Check if the chat container is visible
if (chatContainer.clientHeight > 0) { if (chatContainer.clientHeight > 0) {
const chatContainerParentHeight = chatContainer.parentNode.clientHeight;
const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`;
document.documentElement.style.setProperty("--chat-height", newChatHeight);
document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`);
// Adjust scrollTop based on input height change // Adjust scrollTop based on input height change
if (chatInputHeight !== currentChatInputHeight) { if (chatInputHeight !== currentChatInputHeight) {
const deltaHeight = chatInputHeight - currentChatInputHeight; const deltaHeight = chatInputHeight - currentChatInputHeight;
@ -720,7 +724,7 @@ function isMobile() {
// Function to initialize sidebars // Function to initialize sidebars
function initializeSidebars() { function initializeSidebars() {
const isOnMobile = isMobile(); const isOnMobile = isMobile();
if (isOnMobile) { if (isOnMobile) {
// Mobile state: Hide sidebars and set closed states // Mobile state: Hide sidebars and set closed states
[pastChatsRow, chatControlsRow, headerBar].forEach(el => { [pastChatsRow, chatControlsRow, headerBar].forEach(el => {

View file

@ -5,6 +5,7 @@ import html
import json import json
import pprint import pprint
import re import re
import time
from datetime import datetime from datetime import datetime
from functools import partial from functools import partial
from pathlib import Path from pathlib import Path
@ -145,7 +146,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
instruct_renderer = partial( instruct_renderer = partial(
instruction_template.render, instruction_template.render,
builtin_tools=None, builtin_tools=None,
tools=None, tools=state['tools'] if 'tools' in state else None,
tools_in_user_message=False, tools_in_user_message=False,
add_generation_prompt=False add_generation_prompt=False
) )
@ -171,9 +172,13 @@ def generate_chat_prompt(user_input, state, **kwargs):
messages.append({"role": "system", "content": context}) messages.append({"role": "system", "content": context})
insert_pos = len(messages) insert_pos = len(messages)
for user_msg, assistant_msg in reversed(history): for entry in reversed(history):
user_msg = user_msg.strip() user_msg = entry[0].strip()
assistant_msg = assistant_msg.strip() assistant_msg = entry[1].strip()
tool_msg = entry[2].strip() if len(entry) > 2 else ''
if tool_msg:
messages.insert(insert_pos, {"role": "tool", "content": tool_msg})
if assistant_msg: if assistant_msg:
messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
@ -394,16 +399,13 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
# Extract the reply # Extract the reply
if state['mode'] in ['chat', 'chat-instruct']: if state['mode'] in ['chat', 'chat-instruct']:
visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply + '') visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply)
else: else:
visible_reply = reply + '' visible_reply = reply
visible_reply = html.escape(visible_reply) visible_reply = html.escape(visible_reply)
if shared.stop_everything: if shared.stop_everything:
if output['visible'][-1][1].endswith(''):
output['visible'][-1][1] = output['visible'][-1][1][:-1]
output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
yield output yield output
return return
@ -419,9 +421,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
if is_stream: if is_stream:
yield output yield output
if output['visible'][-1][1].endswith(''):
output['visible'][-1][1] = output['visible'][-1][1][:-1]
output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
yield output yield output
@ -481,9 +480,17 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
send_dummy_reply(state['start_with'], state) send_dummy_reply(state['start_with'], state)
history = state['history'] history = state['history']
last_save_time = time.monotonic()
save_interval = 8
for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)): for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)):
yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history
current_time = time.monotonic()
# Save on first iteration or if save_interval seconds have passed
if i == 0 or (current_time - last_save_time) >= save_interval:
save_history(history, state['unique_id'], state['character_menu'], state['mode'])
last_save_time = current_time
save_history(history, state['unique_id'], state['character_menu'], state['mode']) save_history(history, state['unique_id'], state['character_menu'], state['mode'])

View file

@ -119,7 +119,7 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
reset = True reset = True
# Maximum number of tokens to process in a single forward pass # Maximum number of tokens to process in a single forward pass
max_chunk_size = 2048 max_chunk_size = 256
# Make the forward call # Make the forward call
if labels is None: if labels is None:

View file

@ -66,7 +66,7 @@ class LlamaServer:
"top_k": state["top_k"], "top_k": state["top_k"],
"top_p": state["top_p"], "top_p": state["top_p"],
"min_p": state["min_p"], "min_p": state["min_p"],
"tfs_z": state["tfs"], "top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1,
"typical_p": state["typical_p"], "typical_p": state["typical_p"],
"repeat_penalty": state["repetition_penalty"], "repeat_penalty": state["repetition_penalty"],
"repeat_last_n": state["repetition_penalty_range"], "repeat_last_n": state["repetition_penalty_range"],
@ -102,8 +102,10 @@ class LlamaServer:
penalty_found = False penalty_found = False
for s in samplers: for s in samplers:
if s.strip() in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]: if s.strip() in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc"]:
filtered_samplers.append(s.strip()) filtered_samplers.append(s.strip())
elif s.strip() == "typical_p":
filtered_samplers.append("typ_p")
elif not penalty_found and s.strip() == "repetition_penalty": elif not penalty_found and s.strip() == "repetition_penalty":
filtered_samplers.append("penalties") filtered_samplers.append("penalties")
penalty_found = True penalty_found = True
@ -144,8 +146,9 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print() print()
# Make a direct request with streaming enabled using a context manager # Make the generation request
with self.session.post(url, json=payload, stream=True) as response: response = self.session.post(url, json=payload, stream=True)
try:
response.raise_for_status() # Raise an exception for HTTP errors response.raise_for_status() # Raise an exception for HTTP errors
full_text = "" full_text = ""
@ -182,6 +185,8 @@ class LlamaServer:
print(f"JSON decode error: {e}") print(f"JSON decode error: {e}")
print(f"Problematic line: {line}") print(f"Problematic line: {line}")
continue continue
finally:
response.close()
def generate(self, prompt, state): def generate(self, prompt, state):
output = "" output = ""
@ -210,14 +215,15 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print() print()
response = self.session.post(url, json=payload) for retry in range(5):
result = response.json() response = self.session.post(url, json=payload)
result = response.json()
if "completion_probabilities" in result: if "completion_probabilities" in result:
if use_samplers: if use_samplers:
return result["completion_probabilities"][0]["top_probs"] return result["completion_probabilities"][0]["top_probs"]
else: else:
return result["completion_probabilities"][0]["top_logprobs"] return result["completion_probabilities"][0]["top_logprobs"]
else: else:
raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}") raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
@ -255,9 +261,10 @@ class LlamaServer:
self.server_path, self.server_path,
"--model", self.model_path, "--model", self.model_path,
"--ctx-size", str(shared.args.ctx_size), "--ctx-size", str(shared.args.ctx_size),
"--n-gpu-layers", str(shared.args.n_gpu_layers), "--gpu-layers", str(shared.args.gpu_layers),
"--batch-size", str(shared.args.batch_size), "--batch-size", str(shared.args.batch_size),
"--port", str(self.port), "--port", str(self.port),
"--no-webui",
] ]
if shared.args.flash_attn: if shared.args.flash_attn:
@ -278,8 +285,10 @@ class LlamaServer:
cmd.append("--no-kv-offload") cmd.append("--no-kv-offload")
if shared.args.row_split: if shared.args.row_split:
cmd += ["--split-mode", "row"] cmd += ["--split-mode", "row"]
cache_type = "fp16"
if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types: if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types:
cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type] cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
cache_type = shared.args.cache_type
if shared.args.compress_pos_emb != 1: if shared.args.compress_pos_emb != 1:
cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)] cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
if shared.args.rope_freq_base > 0: if shared.args.rope_freq_base > 0:
@@ -316,9 +325,15 @@ class LlamaServer:
for flag_item in extra_flags.split(','): for flag_item in extra_flags.split(','):
if '=' in flag_item: if '=' in flag_item:
flag, value = flag_item.split('=', 1) flag, value = flag_item.split('=', 1)
cmd += [f"--{flag}", value] if len(flag) <= 3:
cmd += [f"-{flag}", value]
else:
cmd += [f"--{flag}", value]
else: else:
cmd.append(f"--{flag_item}") if len(flag_item) <= 3:
cmd.append(f"-{flag_item}")
else:
cmd.append(f"--{flag_item}")
env = os.environ.copy() env = os.environ.copy()
if os.name == 'posix': if os.name == 'posix':
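A standalone sketch of the new flag-prefix rule above, where names of three characters or fewer get a single dash and longer names a double dash; the example extra-flags string is illustrative, not a claim about which flags llama-server accepts:

```python
# Sketch of the extra-flags parsing above; the input string is illustrative only.
def parse_extra_flags(extra_flags):
    cmd = []
    for flag_item in extra_flags.split(','):
        if '=' in flag_item:
            flag, value = flag_item.split('=', 1)
            cmd += [f"-{flag}" if len(flag) <= 3 else f"--{flag}", value]
        else:
            cmd.append(f"-{flag_item}" if len(flag_item) <= 3 else f"--{flag_item}")
    return cmd

print(parse_extra_flags("ctk=q8_0,fa,override-tensor=exps=CPU"))
# -> ['-ctk', 'q8_0', '-fa', '--override-tensor', 'exps=CPU']
```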
@@ -333,6 +348,7 @@ class LlamaServer:
print(' '.join(str(item) for item in cmd[1:])) print(' '.join(str(item) for item in cmd[1:]))
print() print()
logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}")
# Start the server with pipes for output # Start the server with pipes for output
self.process = subprocess.Popen( self.process = subprocess.Popen(
cmd, cmd,

View file

@@ -5,7 +5,7 @@ import gradio as gr
loaders_and_params = OrderedDict({ loaders_and_params = OrderedDict({
'llama.cpp': [ 'llama.cpp': [
'n_gpu_layers', 'gpu_layers',
'threads', 'threads',
'threads_batch', 'threads_batch',
'batch_size', 'batch_size',
@@ -28,6 +28,7 @@ loaders_and_params = OrderedDict({
'device_draft', 'device_draft',
'ctx_size_draft', 'ctx_size_draft',
'speculative_decoding_accordion', 'speculative_decoding_accordion',
'vram_info',
], ],
'Transformers': [ 'Transformers': [
'gpu_split', 'gpu_split',
@@ -84,7 +85,6 @@ loaders_and_params = OrderedDict({
'no_flash_attn', 'no_flash_attn',
'no_xformers', 'no_xformers',
'no_sdpa', 'no_sdpa',
'exllamav2_info',
'model_draft', 'model_draft',
'draft_max', 'draft_max',
'ctx_size_draft', 'ctx_size_draft',
@@ -299,7 +299,7 @@ loaders_samplers = {
'typical_p', 'typical_p',
'xtc_threshold', 'xtc_threshold',
'xtc_probability', 'xtc_probability',
'tfs', 'top_n_sigma',
'dry_multiplier', 'dry_multiplier',
'dry_allowed_length', 'dry_allowed_length',
'dry_base', 'dry_base',

View file

@@ -7,6 +7,7 @@ from modules import models, shared
from modules.logging_colors import logger from modules.logging_colors import logger
from modules.models import load_model from modules.models import load_model
from modules.text_generation import generate_reply from modules.text_generation import generate_reply
from modules.utils import check_model_loaded
global_scores = None global_scores = None
@@ -33,9 +34,9 @@ def get_next_logits(*args, **kwargs):
def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False): def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
if shared.model is None: model_is_loaded, error_message = check_model_loaded()
logger.error("No model is loaded! Select one in the Model tab.") if not model_is_loaded:
return 'Error: No model is loaded! Select one in the Model tab.', previous return error_message, previous
# llama.cpp case # llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer': if shared.model.__class__.__name__ == 'LlamaServer':
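check_model_loaded() itself is not shown in this diff; judging only from the call sites here and in modules/text_generation.py, it returns a (loaded, error_message) pair. A hypothetical sketch of that shape, not the actual modules/utils.py code:

```python
# Hypothetical sketch of check_model_loaded(), inferred from its call sites only.
from modules import shared
from modules.logging_colors import logger

def check_model_loaded():
    if shared.model_name == 'None' or shared.model is None:
        error_message = 'No model is loaded! Select one in the Model tab.'
        logger.error(error_message)
        return False, error_message
    return True, None
```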

View file

@@ -71,7 +71,6 @@ def llama_cpp_server_loader(model_name):
else: else:
model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
logger.info(f"llama.cpp weights detected: \"{model_file}\"")
try: try:
model = LlamaServer(model_file) model = LlamaServer(model_file)
return model, model return model, model

View file

@@ -1,7 +1,11 @@
import functools
import json import json
import re import re
import subprocess
from math import exp
from pathlib import Path from pathlib import Path
import gradio as gr
import yaml import yaml
from modules import chat, loaders, metadata_gguf, shared, ui from modules import chat, loaders, metadata_gguf, shared, ui
@@ -54,7 +58,7 @@ def get_model_metadata(model):
else: else:
model_file = list(path.glob('*.gguf'))[0] model_file = list(path.glob('*.gguf'))[0]
metadata = metadata_gguf.load_metadata(model_file) metadata = load_gguf_metadata_with_cache(model_file)
for k in metadata: for k in metadata:
if k.endswith('context_length'): if k.endswith('context_length'):
@@ -67,7 +71,7 @@ def get_model_metadata(model):
elif k.endswith('rope.scaling.factor'): elif k.endswith('rope.scaling.factor'):
model_settings['compress_pos_emb'] = metadata[k] model_settings['compress_pos_emb'] = metadata[k]
elif k.endswith('block_count'): elif k.endswith('block_count'):
model_settings['n_gpu_layers'] = metadata[k] + 1 model_settings['gpu_layers'] = metadata[k] + 1
if 'tokenizer.chat_template' in metadata: if 'tokenizer.chat_template' in metadata:
template = metadata['tokenizer.chat_template'] template = metadata['tokenizer.chat_template']
@@ -209,15 +213,27 @@ def apply_model_settings_to_state(model, state):
model_settings = get_model_metadata(model) model_settings = get_model_metadata(model)
if 'loader' in model_settings: if 'loader' in model_settings:
loader = model_settings.pop('loader') loader = model_settings.pop('loader')
# If the user is using an alternative loader for the same model type, let them keep using it
if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']): if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
state['loader'] = loader state['loader'] = loader
for k in model_settings: for k in model_settings:
if k in state: if k in state and k != 'gpu_layers': # Skip gpu_layers, handle separately
state[k] = model_settings[k] state[k] = model_settings[k]
# Handle GPU layers and VRAM update for llama.cpp
if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
vram_info, gpu_layers_update = update_gpu_layers_and_vram(
state['loader'],
model,
model_settings['gpu_layers'],
state['ctx_size'],
state['cache_type'],
auto_adjust=True
)
state['gpu_layers'] = gpu_layers_update
state['vram_info'] = vram_info
return state return state
@@ -277,3 +293,186 @@ def save_instruction_template(model, template):
yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.") yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
else: else:
yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.") yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")
@functools.lru_cache(maxsize=1)
def load_gguf_metadata_with_cache(model_file):
return metadata_gguf.load_metadata(model_file)
def get_model_size_mb(model_file: Path) -> float:
filename = model_file.name
# Check for multipart pattern
match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename)
if match:
# It's a multipart file, find all matching parts
base_pattern = match.group(1)
part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf'))
total_size = sum(p.stat().st_size for p in part_files)
else:
# Single part
total_size = model_file.stat().st_size
return total_size / (1024 ** 2) # Return size in MB
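A quick standalone check of the multipart-filename pattern used above, on made-up filenames (no files are read):

```python
# Sketch: the same multipart-GGUF regex as above, applied to made-up names.
import re

for name in ["some-model-Q4_K_M-00001-of-00003.gguf", "some-model-Q8_0.gguf"]:
    match = re.match(r'(.+)-\d+-of-\d+\.gguf$', name)
    if match:
        # Multipart: every file matching "<base>-*-of-*.gguf" gets its size summed.
        print(name, "-> multipart, base:", match.group(1))
    else:
        print(name, "-> single file")
# some-model-Q4_K_M-00001-of-00003.gguf -> multipart, base: some-model-Q4_K_M
# some-model-Q8_0.gguf -> single file
```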
def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
metadata = load_gguf_metadata_with_cache(model_file)
size_in_mb = get_model_size_mb(model_file)
# Extract values from metadata
n_layers = None
n_kv_heads = None
embedding_dim = None
context_length = None
feed_forward_dim = None
for key, value in metadata.items():
if key.endswith('.block_count'):
n_layers = value
elif key.endswith('.attention.head_count_kv'):
n_kv_heads = value
elif key.endswith('.embedding_length'):
embedding_dim = value
elif key.endswith('.context_length'):
context_length = value
elif key.endswith('.feed_forward_length'):
feed_forward_dim = value
if gpu_layers > n_layers:
gpu_layers = n_layers
# Convert cache_type to numeric
if cache_type == 'q4_0':
cache_type = 4
elif cache_type == 'q8_0':
cache_type = 8
else:
cache_type = 16
# Derived features
size_per_layer = size_in_mb / max(n_layers, 1e-6)
context_per_layer = context_length / max(n_layers, 1e-6)
ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6)
kv_cache_factor = n_kv_heads * cache_type * ctx_size
# Helper function for smaller
def smaller(x, y):
return 1 if x < y else 0
# Calculate VRAM using the model
# Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
vram = (
(size_per_layer - 21.19195204848197)
* exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845))
+ 0.0006621544775632052 * context_per_layer
+ 3.34664386576376e-05 * kv_cache_factor
) * (1.363306170123392 + gpu_layers) + 1255.163594536052
return vram
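To make the regression easier to follow, here is the same calculation restated as a self-contained sketch; the constants are copied verbatim from the function above, while the model metadata values are invented for illustration:

```python
# Standalone restatement of the VRAM regression above, with invented metadata values.
from math import exp

size_in_mb = 4200.0       # hypothetical total GGUF size in MiB
n_layers = 40             # hypothetical block_count
n_kv_heads = 8
embedding_dim = 4096
context_length = 32768    # trained context length from metadata
feed_forward_dim = 14336
gpu_layers, ctx_size, cache_bits = 40, 8192, 16   # fp16 cache

size_per_layer = size_in_mb / n_layers
context_per_layer = context_length / n_layers
ffn_per_embedding = feed_forward_dim / embedding_dim
kv_cache_factor = n_kv_heads * cache_bits * ctx_size

vram = (
    (size_per_layer - 21.19195204848197)
    * exp(0.0001047328491557063 * size_in_mb * (1 if ffn_per_embedding < 2.671096993407845 else 0))
    + 0.0006621544775632052 * context_per_layer
    + 3.34664386576376e-05 * kv_cache_factor
) * (1.363306170123392 + gpu_layers) + 1255.163594536052

print(f"Estimated VRAM: {vram:.0f} MiB")
```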
def get_nvidia_free_vram():
"""
Calculates the total free VRAM across all NVIDIA GPUs by parsing nvidia-smi output.
Returns:
int: The total free VRAM in MiB summed across all detected NVIDIA GPUs.
Returns -1 if nvidia-smi command fails (not found, error, etc.).
Returns 0 if nvidia-smi succeeds but no GPU memory info found.
"""
try:
# Execute nvidia-smi command
result = subprocess.run(
['nvidia-smi'],
capture_output=True,
text=True,
check=False
)
# Check if nvidia-smi returned an error
if result.returncode != 0:
return -1
# Parse the output for memory usage patterns
output = result.stdout
# Find memory usage like "XXXXMiB / YYYYMiB"
# Captures used and total memory for each GPU
matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output)
if not matches:
# No GPUs found in expected format
return 0
total_free_vram_mib = 0
for used_mem_str, total_mem_str in matches:
try:
used_mib = int(used_mem_str)
total_mib = int(total_mem_str)
total_free_vram_mib += (total_mib - used_mib)
except ValueError:
# Skip malformed entries
pass
return total_free_vram_mib
except FileNotFoundError:
# nvidia-smi not found (likely no NVIDIA drivers installed)
return -1
except Exception:
# Handle any other unexpected exceptions
return -1
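For illustration, the same memory regex applied to a fabricated nvidia-smi table fragment (the numbers are invented):

```python
# Sketch: the memory regex above, run against a fabricated nvidia-smi line.
import re

fake_output = "| 30%   45C    P8    25W / 350W |   1024MiB / 24576MiB |   0%   Default |"
matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", fake_output)
free_mib = sum(int(total) - int(used) for used, total in matches)
print(matches, free_mib)  # [('1024', '24576')] 23552
```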
def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
"""
Unified function to handle GPU layers and VRAM updates.
Args:
for_ui: If True, returns Gradio updates. If False, returns raw values.
Returns:
- If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
- If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
"""
if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"):
vram_info = "<div id=\"vram-info\"'>Estimated VRAM to load the model:</span>"
if for_ui:
return (vram_info, gr.update()) if auto_adjust else vram_info
else:
return (0, gpu_layers) if auto_adjust else 0
current_layers = gpu_layers
max_layers = gpu_layers
if auto_adjust:
# Get max layers from model metadata
model_settings = get_model_metadata(model)
max_layers = model_settings.get('gpu_layers', gpu_layers)
# Auto-adjust based on available VRAM
available_vram = get_nvidia_free_vram()
if available_vram > 0:
tolerance = 906
current_layers = max_layers
while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
current_layers -= 1
# Calculate VRAM with current layers
vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)
if for_ui:
vram_info = f"<div id=\"vram-info\"'>Estimated VRAM to load the model: <span class=\"value\">{vram_usage:.0f} MiB</span>"
if auto_adjust:
return vram_info, gr.update(value=current_layers, maximum=max_layers)
else:
return vram_info
else:
if auto_adjust:
return vram_usage, current_layers
else:
return vram_usage
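The auto-adjust branch above amounts to dropping one layer at a time until the estimate fits under free VRAM minus the 906 MiB tolerance. A toy sketch of that loop with a stub estimator (the linear cost function is purely illustrative, not the real estimate_vram):

```python
# Toy sketch of the layer auto-adjustment above, using a stub VRAM estimator.
def stub_estimate_vram(layers):
    return 1200 + 150 * layers   # invented: fixed cost plus a per-layer cost, in MiB

def auto_adjust_layers(max_layers, available_vram, tolerance=906):
    layers = max_layers
    while layers > 0 and stub_estimate_vram(layers) > available_vram - tolerance:
        layers -= 1
    return layers

print(auto_adjust_layers(max_layers=40, available_vram=6000))  # -> 25
```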

View file

@@ -11,7 +11,7 @@ from modules.logging_colors import logger
def default_preset(): def default_preset():
return { result = {
'temperature': 1, 'temperature': 1,
'dynatemp_low': 1, 'dynatemp_low': 1,
'dynatemp_high': 1, 'dynatemp_high': 1,
@@ -46,10 +46,17 @@ def default_preset():
'do_sample': True, 'do_sample': True,
'dynamic_temperature': False, 'dynamic_temperature': False,
'temperature_last': False, 'temperature_last': False,
'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram', 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntop_n_sigma\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"',
} }
if shared.args.portable:
samplers = result['sampler_priority'].split('\n')
samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]]
result['sampler_priority'] = '\n'.join(samplers)
return result
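In portable mode the filter above keeps only the llama.cpp-side samplers from the default priority string. A quick sketch of what survives, with the priority string copied from the default above:

```python
# Sketch: the portable-mode filter applied to the default sampler_priority string.
PORTABLE_SAMPLERS = ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature",
                     "xtc", "typical_p", "repetition_penalty"]

default_priority = ("repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\n"
                    "top_n_sigma\ntemperature\ndynamic_temperature\nquadratic_sampling\n"
                    "top_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\n"
                    "min_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram")

kept = [s for s in default_priority.split('\n') if s in PORTABLE_SAMPLERS]
print(kept)
# ['repetition_penalty', 'dry', 'top_n_sigma', 'temperature', 'top_k', 'top_p',
#  'typical_p', 'min_p', 'xtc']
```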
def presets_params(): def presets_params():
return [k for k in default_preset()] return [k for k in default_preset()]

View file

@@ -47,7 +47,6 @@ settings = {
'max_new_tokens_max': 4096, 'max_new_tokens_max': 4096,
'prompt_lookup_num_tokens': 0, 'prompt_lookup_num_tokens': 0,
'max_tokens_second': 0, 'max_tokens_second': 0,
'max_updates_second': 12,
'auto_max_new_tokens': True, 'auto_max_new_tokens': True,
'ban_eos_token': False, 'ban_eos_token': False,
'add_bos_token': True, 'add_bos_token': True,
@@ -60,7 +59,6 @@ settings = {
'custom_stopping_strings': '', 'custom_stopping_strings': '',
'custom_token_bans': '', 'custom_token_bans': '',
'negative_prompt': '', 'negative_prompt': '',
'autoload_model': False,
'dark_theme': True, 'dark_theme': True,
'default_extensions': [], 'default_extensions': [],
'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}", 'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}",
@@ -121,7 +119,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa
group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.') group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
@@ -130,9 +128,9 @@ group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
# Cache # Cache
group = parser.add_argument_group('Context and cache management') group = parser.add_argument_group('Context and cache')
group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.')
group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
# Speculative decoding # Speculative decoding
group = parser.add_argument_group('Speculative decoding') group = parser.add_argument_group('Speculative decoding')
@@ -161,10 +159,6 @@ group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='B
group = parser.add_argument_group('TensorRT-LLM') group = parser.add_argument_group('TensorRT-LLM')
group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
# Cache
group = parser.add_argument_group('Cache')
group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
# DeepSpeed # DeepSpeed
group = parser.add_argument_group('DeepSpeed') group = parser.add_argument_group('DeepSpeed')
group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
@@ -190,6 +184,7 @@ group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certific
group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None) group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None)
group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy') group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy')
group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.') group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.')
group.add_argument('--portable', action='store_true', help='Hide features not available in portable mode like training.')
# API # API
group = parser.add_argument_group('API') group = parser.add_argument_group('API')
@@ -311,11 +306,13 @@ if args.api or args.public_api:
add_extension('openai', last=True) add_extension('openai', last=True)
# Load model-specific settings # Load model-specific settings
with Path(f'{args.model_dir}/config.yaml') as p: p = Path(f'{args.model_dir}/config.yaml')
if p.exists(): if p.exists():
model_config = yaml.safe_load(open(p, 'r').read()) model_config = yaml.safe_load(open(p, 'r').read())
else: else:
model_config = {} model_config = {}
del p
# Load custom model-specific settings # Load custom model-specific settings
user_config = load_user_config() user_config = load_user_config()

View file

@@ -1,15 +1,15 @@
from pathlib import Path from pathlib import Path
import torch
import tensorrt_llm import tensorrt_llm
import torch
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
from modules import shared from modules import shared
from modules.logging_colors import logger from modules.logging_colors import logger
from modules.text_generation import ( from modules.text_generation import (
get_max_prompt_length, get_max_prompt_length,
get_reply_from_output_ids get_reply_from_output_ids
) )
from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp
class TensorRTLLMModel: class TensorRTLLMModel:

View file

@@ -14,6 +14,7 @@ from modules.callbacks import Iteratorize
from modules.extensions import apply_extensions from modules.extensions import apply_extensions
from modules.html_generator import generate_basic_html from modules.html_generator import generate_basic_html
from modules.logging_colors import logger from modules.logging_colors import logger
from modules.utils import check_model_loaded
def generate_reply(*args, **kwargs): def generate_reply(*args, **kwargs):
@@ -34,8 +35,8 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
# Find the appropriate generation function # Find the appropriate generation function
generate_func = apply_extensions('custom_generate_reply') generate_func = apply_extensions('custom_generate_reply')
if generate_func is None: if generate_func is None:
if shared.model_name == 'None' or shared.model is None: model_is_loaded, error_message = check_model_loaded()
logger.error("No model is loaded! Select one in the Model tab.") if not model_is_loaded:
yield '' yield ''
return return
@@ -64,41 +65,39 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
all_stop_strings += st all_stop_strings += st
shared.stop_everything = False shared.stop_everything = False
last_update = -1
reply = '' reply = ''
is_stream = state['stream'] is_stream = state['stream']
if len(all_stop_strings) > 0 and not state['stream']: if len(all_stop_strings) > 0 and not state['stream']:
state = copy.deepcopy(state) state = copy.deepcopy(state)
state['stream'] = True state['stream'] = True
min_update_interval = 0
if state.get('max_updates_second', 0) > 0:
min_update_interval = 1 / state['max_updates_second']
# Generate # Generate
last_update = -1
latency_threshold = 1 / 1000
for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat): for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat):
cur_time = time.monotonic()
reply, stop_found = apply_stopping_strings(reply, all_stop_strings) reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
if escape_html: if escape_html:
reply = html.escape(reply) reply = html.escape(reply)
if is_stream: if is_stream:
cur_time = time.time()
# Limit number of tokens/second to make text readable in real time # Limit number of tokens/second to make text readable in real time
if state['max_tokens_second'] > 0: if state['max_tokens_second'] > 0:
diff = 1 / state['max_tokens_second'] - (cur_time - last_update) diff = 1 / state['max_tokens_second'] - (cur_time - last_update)
if diff > 0: if diff > 0:
time.sleep(diff) time.sleep(diff)
last_update = time.time() last_update = time.monotonic()
yield reply yield reply
# Limit updates to avoid lag in the Gradio UI # Limit updates to avoid lag in the Gradio UI
# API updates are not limited # API updates are not limited
else: else:
if cur_time - last_update > min_update_interval: # If 'generate_func' takes less than 0.001 seconds to yield the next token
last_update = cur_time # (equivalent to more than 1000 tok/s), assume that the UI is lagging behind and skip yielding
if (cur_time - last_update) > latency_threshold:
yield reply yield reply
last_update = time.monotonic()
if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything): if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything):
break break
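The streaming change above drops the max_updates_second budget in favour of a latency heuristic: if the generator produced the next chunk in under 1 ms (more than roughly 1000 tok/s), the yield to the UI is skipped on the assumption that Gradio is the bottleneck. A toy sketch of that decision rule with fabricated per-token timings:

```python
# Toy sketch of the 1 ms latency heuristic above, driven by fabricated timings.
latency_threshold = 1 / 1000   # seconds; below this, the UI update is skipped

def should_yield(cur_time, last_update):
    # Yield to the UI only if the generator took longer than the threshold.
    return (cur_time - last_update) > latency_threshold

clock, last_update = 0.0, -1
for dt in [0.0003, 0.0004, 0.0020, 0.0005, 0.0015]:   # invented per-token latencies
    clock += dt
    if should_yield(clock, last_update):
        print(f"UI update at t={clock:.4f}s")
        last_update = clock
    else:
        print(f"skipped at t={clock:.4f}s")
```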
@@ -471,7 +470,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
t1 = time.time() t1 = time.time()
original_tokens = len(original_input_ids[0]) original_tokens = len(original_input_ids[0])
new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0) new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
return return
@@ -480,7 +479,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
For models that do not use the transformers library for sampling For models that do not use the transformers library for sampling
""" """
seed = set_manual_seed(state['seed']) state['seed'] = set_manual_seed(state['seed'])
t0 = time.time() t0 = time.time()
reply = '' reply = ''
try: try:
@@ -500,7 +499,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
t1 = time.time() t1 = time.time()
original_tokens = len(encode(original_question)[0]) original_tokens = len(encode(original_question)[0])
new_tokens = len(encode(original_question + reply)[0]) - original_tokens new_tokens = len(encode(original_question + reply)[0]) - original_tokens
print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})')
return return

View file

@@ -61,7 +61,7 @@ if not shared.args.old_colors:
background_fill_primary_dark='var(--darker-gray)', background_fill_primary_dark='var(--darker-gray)',
body_background_fill="white", body_background_fill="white",
block_background_fill="transparent", block_background_fill="transparent",
body_text_color="#333", body_text_color='rgb(64, 64, 64)',
button_secondary_background_fill="#f4f4f4", button_secondary_background_fill="#f4f4f4",
button_secondary_border_color="var(--border-color-primary)", button_secondary_border_color="var(--border-color-primary)",
@@ -105,7 +105,7 @@ def list_model_elements():
'filter_by_loader', 'filter_by_loader',
'loader', 'loader',
'cpu_memory', 'cpu_memory',
'n_gpu_layers', 'gpu_layers',
'threads', 'threads',
'threads_batch', 'threads_batch',
'batch_size', 'batch_size',
@@ -192,7 +192,6 @@ def list_interface_input_elements():
'max_new_tokens', 'max_new_tokens',
'prompt_lookup_num_tokens', 'prompt_lookup_num_tokens',
'max_tokens_second', 'max_tokens_second',
'max_updates_second',
'do_sample', 'do_sample',
'dynamic_temperature', 'dynamic_temperature',
'temperature_last', 'temperature_last',

View file

@@ -46,8 +46,8 @@ def create_ui():
with gr.Row(): with gr.Row():
with gr.Column(elem_id='chat-col'): with gr.Column(elem_id='chat-col'):
shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer
shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
with gr.Row(elem_id="chat-input-row"): with gr.Row(elem_id="chat-input-row"):
with gr.Column(scale=1, elem_id='gr-hover-container'): with gr.Column(scale=1, elem_id='gr-hover-container'):
gr.HTML(value='<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">&#9776;</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover') gr.HTML(value='<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">&#9776;</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover')

View file

@@ -14,6 +14,7 @@ from modules.models_settings import (
get_model_metadata, get_model_metadata,
save_instruction_template, save_instruction_template,
save_model_settings, save_model_settings,
update_gpu_layers_and_vram,
update_model_parameters update_model_parameters
) )
from modules.utils import gradio from modules.utils import gradio
@@ -26,71 +27,36 @@ def create_ui():
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
with gr.Row(): with gr.Row():
with gr.Column(): shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
with gr.Row(): ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu) shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu) shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)
shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)
with gr.Column(): shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None)
with gr.Row():
shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
with gr.Row():
with gr.Column():
shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
with gr.Blocks(): with gr.Blocks():
gr.Markdown("## Main options")
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags) shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
with gr.Column(): with gr.Column():
shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).')
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
# Speculative decoding # Speculative decoding
@@ -99,15 +65,50 @@ def create_ui():
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu) shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.')
shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.') shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.')
shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.') shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.')
with gr.Column(): gr.Markdown("## Other options")
with gr.Row(): with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'):
shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu) with gr.Row():
with gr.Column():
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
with gr.Column():
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
if not shared.args.portable:
with gr.Row():
shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
with gr.Column():
with gr.Tab("Download"): with gr.Tab("Download"):
shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu) shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu)
shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu) shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu)
@@ -132,11 +133,10 @@ def create_event_handlers():
# In this event handler, the interface state is read and updated # In this event handler, the interface state is read and updated
# with the model defaults (if any), and then the model is loaded # with the model defaults (if any), and then the model is loaded
# unless "autoload_model" is unchecked
shared.gradio['model_menu'].change( shared.gradio['model_menu'].change(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then( handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then(
load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success( partial(load_model_wrapper, autoload=False), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success(
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)
shared.gradio['load_model'].click( shared.gradio['load_model'].click(
@@ -145,15 +145,31 @@ def create_event_handlers():
partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success(
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)
shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False) shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then(
partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False)
shared.gradio['save_model_settings'].click( shared.gradio['save_model_settings'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)
shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) # For ctx_size and cache_type - auto-adjust GPU layers
for param in ['ctx_size', 'cache_type']:
shared.gradio[param].change(
partial(update_gpu_layers_and_vram, auto_adjust=True),
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info', 'gpu_layers'), show_progress=False)
# For manual gpu_layers changes - only update VRAM
shared.gradio['gpu_layers'].change(
partial(update_gpu_layers_and_vram, auto_adjust=False),
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info'), show_progress=False)
if not shared.args.portable:
shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model'))
shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True) shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True)
@@ -192,6 +208,26 @@ def load_lora_wrapper(selected_loras):
def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False): def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
try: try:
# Handle direct GGUF URLs
if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")):
try:
path = repo_id.split("huggingface.co/")[1]
# Extract the repository ID (first two parts of the path)
parts = path.split("/")
if len(parts) >= 2:
extracted_repo_id = f"{parts[0]}/{parts[1]}"
# Extract the filename (last part of the path)
filename = repo_id.split("/")[-1]
if "?download=true" in filename:
filename = filename.replace("?download=true", "")
repo_id = extracted_repo_id
specific_file = filename
except Exception:
pass  # fall back to treating the input as a plain repo ID
if repo_id == "": if repo_id == "":
yield ("Please enter a model path") yield ("Please enter a model path")
return return
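For reference, the URL handling added above boils down to splitting a direct Hugging Face .gguf link into a repo ID and a file name. A minimal sketch with a hypothetical helper name (the org/repo/file values are placeholders):

```python
# Hypothetical helper mirroring the URL handling above (not part of the codebase).
def split_gguf_url(url):
    """Turn a direct Hugging Face .gguf link into (repo_id, filename)."""
    path = url.split("huggingface.co/")[1]            # 'org/repo/resolve/main/file.gguf'
    parts = path.split("/")
    repo_id = f"{parts[0]}/{parts[1]}"                 # first two path components
    filename = url.split("/")[-1].replace("?download=true", "")
    return repo_id, filename


print(split_gguf_url(
    "https://huggingface.co/org/repo/resolve/main/model-Q4_K_M.gguf?download=true"
))
# ('org/repo', 'model-Q4_K_M.gguf')
```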
@@ -205,6 +241,18 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
yield ("Getting the download links from Hugging Face") yield ("Getting the download links from Hugging Face")
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file) links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file)
# Check for multiple GGUF files
gguf_files = [link for link in links if link.lower().endswith('.gguf')]
if len(gguf_files) > 1 and not specific_file:
output = "Multiple GGUF files found. Please copy one of the following filenames to the 'File name' field:\n\n```\n"
for link in gguf_files:
output += f"{Path(link).name}\n"
output += "```"
yield output
return
if return_links: if return_links:
output = "```\n" output = "```\n"
for link in links: for link in links:
@@ -252,10 +300,34 @@ def update_truncation_length(current_length, state):
return current_length return current_length
def get_initial_vram_info():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
return update_gpu_layers_and_vram(
shared.args.loader,
shared.model_name,
shared.args.gpu_layers,
shared.args.ctx_size,
shared.args.cache_type,
auto_adjust=False,
for_ui=True
)
return "<div id=\"vram-info\"'>Estimated VRAM to load the model:</span>"
def get_initial_gpu_layers_max():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
model_settings = get_model_metadata(shared.model_name)
return model_settings.get('gpu_layers', 256)
return 256
def handle_load_model_event_initial(model, state): def handle_load_model_event_initial(model, state):
state = apply_model_settings_to_state(model, state) state = apply_model_settings_to_state(model, state)
output = ui.apply_interface_values(state) output = ui.apply_interface_values(state)
update_model_parameters(state) update_model_parameters(state) # This updates the command-line flags
return output + [state] return output + [state]
View file
@@ -21,7 +21,7 @@ def create_ui(default_preset):
shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button') shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button')
with gr.Column(): with gr.Column():
shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown')
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
@@ -71,8 +71,6 @@ def create_ui(default_preset):
shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.') shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.')
shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.') shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.')
shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.') shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.')
shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.')
with gr.Column(): with gr.Column():
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
@@ -82,7 +80,7 @@ def create_ui(default_preset):
shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle <think> mode.') shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.') shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')
View file
@@ -23,11 +23,15 @@ def create_ui():
shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table') shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table')
with gr.Column(): with gr.Column():
extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu) if not shared.args.portable:
extension_status = gr.Markdown() extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
extension_status = gr.Markdown()
else:
pass
shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light') shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light')
extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False) if not shared.args.portable:
extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
# Reset interface event # Reset interface event
shared.gradio['reset_interface'].click( shared.gradio['reset_interface'].click(
View file
@@ -72,6 +72,20 @@ def natural_keys(text):
return [atoi(c) for c in re.split(r'(\d+)', text)] return [atoi(c) for c in re.split(r'(\d+)', text)]
def check_model_loaded():
if shared.model_name == 'None' or shared.model is None:
if len(get_available_models()) <= 1:
error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it"
logger.error(error_msg)
return False, error_msg
else:
error_msg = "No model is loaded. Please select one in the Model tab."
logger.error(error_msg)
return False, error_msg
return True, None
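A short sketch of how a caller might consume the (success, error_msg) pair returned by check_model_loaded; the surrounding function name is hypothetical:

```python
# Illustrative caller (name is hypothetical); it bails out early with a
# user-facing message when no model is loaded.
def generate_reply(prompt):
    ok, error_msg = check_model_loaded()
    if not ok:
        yield error_msg
        return
    # ... proceed with generation using shared.model / shared.tokenizer ...
```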
def get_available_models(): def get_available_models():
# Get all GGUF files # Get all GGUF files
gguf_files = get_available_ggufs() gguf_files = get_available_ggufs()
View file
@@ -126,7 +126,7 @@ def check_env():
sys.exit(1) sys.exit(1)
# Ensure this is a new environment and not the base environment # Ensure this is a new environment and not the base environment
if os.environ["CONDA_DEFAULT_ENV"] == "base": if os.environ.get("CONDA_DEFAULT_ENV", "") == "base":
print("Create an environment for this project and activate it. Exiting...") print("Create an environment for this project and activate it. Exiting...")
sys.exit(1) sys.exit(1)
@@ -222,7 +222,7 @@ def update_pytorch_and_python():
if "+cu" in torver: if "+cu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124" install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
elif "+rocm" in torver: elif "+rocm" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1" install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4"
elif "+cpu" in torver: elif "+cpu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu" install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
elif "+cxx11" in torver: elif "+cxx11" in torver:
@@ -273,7 +273,7 @@ def install_webui():
"What is your GPU?", "What is your GPU?",
{ {
'A': 'NVIDIA - CUDA 12.4', 'A': 'NVIDIA - CUDA 12.4',
'B': 'AMD - Linux/macOS only, requires ROCm 6.1', 'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4',
'C': 'Apple M Series', 'C': 'Apple M Series',
'D': 'Intel Arc (beta)', 'D': 'Intel Arc (beta)',
'N': 'CPU mode' 'N': 'CPU mode'
@@ -314,7 +314,7 @@ def install_webui():
if selected_gpu == "NVIDIA": if selected_gpu == "NVIDIA":
install_pytorch += "--index-url https://download.pytorch.org/whl/cu124" install_pytorch += "--index-url https://download.pytorch.org/whl/cu124"
elif selected_gpu == "AMD": elif selected_gpu == "AMD":
install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1" install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4"
elif selected_gpu in ["APPLE", "NONE"]: elif selected_gpu in ["APPLE", "NONE"]:
install_pytorch += "--index-url https://download.pytorch.org/whl/cpu" install_pytorch += "--index-url https://download.pytorch.org/whl/cpu"
elif selected_gpu == "INTEL": elif selected_gpu == "INTEL":
View file
@@ -30,12 +30,12 @@ sse-starlette==1.6.5
tiktoken tiktoken
# CUDA wheels # CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
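The wheel URLs in these requirements files rely on PEP 508 environment markers, so pip installs only the build that matches the current OS, CPU architecture, and Python version. A quick way to evaluate such a marker locally, assuming the packaging library is available (it is not listed in these files):

```python
# Evaluate a PEP 508 marker the same way pip does when choosing a wheel.
from packaging.markers import Marker

marker = Marker(
    'platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"'
)
print(marker.evaluate())  # True only on 64-bit Linux running Python 3.11
```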
View file
@@ -29,6 +29,7 @@ sse-starlette==1.6.5
tiktoken tiktoken
# AMD wheels # AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
View file
@@ -29,6 +29,7 @@ sse-starlette==1.6.5
tiktoken tiktoken
# AMD wheels # AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
View file
@@ -29,7 +29,7 @@ sse-starlette==1.6.5
tiktoken tiktoken
# Mac wheels # Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
View file
@@ -29,8 +29,8 @@ sse-starlette==1.6.5
tiktoken tiktoken
# Mac wheels # Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
View file
@@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken tiktoken
# llama.cpp (CPU only, AVX2) # llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
View file
@@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken tiktoken
# llama.cpp (CPU only, no AVX2) # llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
View file
@@ -30,12 +30,12 @@ sse-starlette==1.6.5
tiktoken tiktoken
# CUDA wheels # CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
View file
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken tiktoken
# CUDA wheels # CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
View file
@@ -1,18 +0,0 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
View file
@@ -1,18 +0,0 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
View file
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken tiktoken
# Mac wheels # Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
View file
@@ -15,6 +15,6 @@ sse-starlette==1.6.5
tiktoken tiktoken
# Mac wheels # Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
View file
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken tiktoken
# llama.cpp (CPU only, AVX2) # llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
View file
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken tiktoken
# llama.cpp (CPU only, no AVX2) # llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
View file
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken tiktoken
# CUDA wheels # CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
View file
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken tiktoken
# CUDA wheels # CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
View file
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken tiktoken
# CUDA wheels # CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
View file
@@ -51,6 +51,7 @@ from modules.models import load_model, unload_model_if_idle
from modules.models_settings import ( from modules.models_settings import (
get_fallback_settings, get_fallback_settings,
get_model_metadata, get_model_metadata,
update_gpu_layers_and_vram,
update_model_parameters update_model_parameters
) )
from modules.shared import do_cmd_flags_warnings from modules.shared import do_cmd_flags_warnings
@@ -90,7 +91,7 @@ def create_interface():
'instruction_template_str': shared.settings['instruction_template_str'], 'instruction_template_str': shared.settings['instruction_template_str'],
'prompt_menu-default': shared.settings['prompt-default'], 'prompt_menu-default': shared.settings['prompt-default'],
'prompt_menu-notebook': shared.settings['prompt-notebook'], 'prompt_menu-notebook': shared.settings['prompt-notebook'],
'filter_by_loader': shared.args.loader or 'All' 'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp'
}) })
if Path("user_data/cache/pfp_character.png").exists(): if Path("user_data/cache/pfp_character.png").exists():
@@ -127,7 +128,8 @@ def create_interface():
ui_parameters.create_ui(shared.settings['preset']) # Parameters tab ui_parameters.create_ui(shared.settings['preset']) # Parameters tab
ui_model_menu.create_ui() # Model tab ui_model_menu.create_ui() # Model tab
training.create_ui() # Training tab if not shared.args.portable:
training.create_ui() # Training tab
ui_session.create_ui() # Session tab ui_session.create_ui() # Session tab
# Generation events # Generation events
@@ -247,6 +249,20 @@ if __name__ == "__main__":
model_settings = get_model_metadata(model_name) model_settings = get_model_metadata(model_name)
update_model_parameters(model_settings, initial=True) # hijack the command-line arguments update_model_parameters(model_settings, initial=True) # hijack the command-line arguments
# Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
vram_usage, adjusted_layers = update_gpu_layers_and_vram(
shared.args.loader,
model_name,
model_settings['gpu_layers'],
shared.args.ctx_size,
shared.args.cache_type,
auto_adjust=True,
for_ui=False
)
shared.args.gpu_layers = adjusted_layers
# Load the model # Load the model
shared.model, shared.tokenizer = load_model(model_name) shared.model, shared.tokenizer = load_model(model_name)
if shared.args.lora: if shared.args.lora:
View file
@@ -1,10 +1,15 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
cd "$(dirname "${BASH_SOURCE[0]}")" cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case # Portable install case
if [ -d "portable_env" ]; then if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch "$@" ./portable_env/bin/python3 server.py --portable --api --auto-launch "$@"
exit $? exit $?
fi fi
@@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
exit exit
fi fi
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
export CUDA_PATH="$INSTALL_ENV_DIR" export CUDA_PATH="$INSTALL_ENV_DIR"
export CUDA_HOME="$CUDA_PATH" export CUDA_HOME="$CUDA_PATH"
View file
@@ -1,10 +1,15 @@
#!/bin/bash #!/bin/bash
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
cd "$(dirname "${BASH_SOURCE[0]}")" cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case # Portable install case
if [ -d "portable_env" ]; then if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@" ./portable_env/bin/python3 server.py --portable --api --auto-launch --api-port 5005 "$@"
exit $? exit $?
fi fi
@@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
exit exit
fi fi
# environment isolation
export PYTHONNOUSERSITE=1
unset PYTHONPATH
unset PYTHONHOME
export CUDA_PATH="$INSTALL_ENV_DIR" export CUDA_PATH="$INSTALL_ENV_DIR"
export CUDA_HOME="$CUDA_PATH" export CUDA_HOME="$CUDA_PATH"
View file
@@ -1,11 +1,16 @@
@echo off @echo off
setlocal enabledelayedexpansion setlocal enabledelayedexpansion
@rem environment isolation
set PYTHONNOUSERSITE=1
set PYTHONPATH=
set PYTHONHOME=
cd /D "%~dp0" cd /D "%~dp0"
@rem Portable install case @rem Portable install case
if exist "portable_env" ( if exist "portable_env" (
.\portable_env\python.exe server.py --api --auto-launch %* .\portable_env\python.exe server.py --portable --api --auto-launch %*
exit /b %errorlevel% exit /b %errorlevel%
) )
@@ -87,10 +92,6 @@ if not exist "%INSTALL_ENV_DIR%" (
@rem check if conda environment was actually created @rem check if conda environment was actually created
if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end ) if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end )
@rem environment isolation
set PYTHONNOUSERSITE=1
set PYTHONPATH=
set PYTHONHOME=
set "CUDA_PATH=%INSTALL_ENV_DIR%" set "CUDA_PATH=%INSTALL_ENV_DIR%"
set "CUDA_HOME=%CUDA_PATH%" set "CUDA_HOME=%CUDA_PATH%"
View file
@@ -18,7 +18,6 @@ max_new_tokens_min: 1
max_new_tokens_max: 4096 max_new_tokens_max: 4096
prompt_lookup_num_tokens: 0 prompt_lookup_num_tokens: 0
max_tokens_second: 0 max_tokens_second: 0
max_updates_second: 12
auto_max_new_tokens: true auto_max_new_tokens: true
ban_eos_token: false ban_eos_token: false
add_bos_token: true add_bos_token: true
@@ -31,7 +30,6 @@ seed: -1
custom_stopping_strings: '' custom_stopping_strings: ''
custom_token_bans: '' custom_token_bans: ''
negative_prompt: '' negative_prompt: ''
autoload_model: false
dark_theme: true dark_theme: true
default_extensions: [] default_extensions: []
instruction_template_str: |- instruction_template_str: |-