Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2025-06-08 06:35:57 -04:00)

Commit dc3094549e: 65 changed files with 955 additions and 406 deletions
@@ -102,6 +102,8 @@ jobs:
       shell: bash
       run: |
         rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
+        allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+        find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf

         # Define common variables
         CUDA_VERSION="${{ matrix.cuda }}"
@@ -101,6 +101,8 @@ jobs:
       shell: bash
       run: |
         rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
+        allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+        find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf

         # Define common variables
         AVX_SUPPORT="${{ matrix.avx }}"
.github/workflows/build-portable-release.yml (vendored): 2 changed lines

@@ -101,6 +101,8 @@ jobs:
       shell: bash
       run: |
         rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker
+        allowed=("character_bias" "gallery" "openai" "sd_api_pictures")
+        find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf

         # Define common variables
         AVX_SUPPORT="${{ matrix.avx }}"
README.md: 10 changed lines

@@ -22,7 +22,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 - Free-form text generation in the Default/Notebook tabs without being limited to chat turns. You can send formatted conversations from the Chat tab to these.
 - Multiple sampling parameters and generation options for sophisticated text generation control.
 - Switch between different models easily in the UI without restarting, with fine control over settings.
-- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
+- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
 - 100% offline and private, with zero telemetry, external resources, or remote update requests.
 - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
@@ -44,7 +44,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases

 To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again.

-You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`.
+You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, or `update_wizard_macos.sh`.

 <details>
 <summary>
@@ -55,12 +55,12 @@ Setup details and information about installing manually

 The script uses Miniconda to set up a Conda environment in the `installer_files` folder.

-If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, `cmd_macos.sh`, or `cmd_wsl.bat`.
+If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`.

 * There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root.
 * To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts.
 * For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki).
-* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.
+* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`.

 ### Manual installation using Conda

@@ -90,7 +90,7 @@ conda activate textgen
 |--------|---------|---------|
 | Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
 | Linux/WSL | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu` |
-| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.1` |
+| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` |
 | MacOS + MPS | Any | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` |
 | Windows | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` |
 | Windows | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` |
@@ -1,7 +1,9 @@
 .message {
   display: grid;
+  align-items: start;
   grid-template-columns: 60px minmax(0, 1fr);
-  padding-bottom: 28px;
+  padding-bottom: 14px;
+  padding-top: 14px;
   font-size: 18px;
   font-family: Roboto, Arial, sans-serif; /* Modern font */
   line-height: 1.5;
@@ -102,6 +104,7 @@
 @media screen and (width <= 688px) {
   .message {
     display: grid;
+    align-items: start;
     grid-template-columns: 60px minmax(0, 1fr);
     padding-bottom: 25px;
     font-size: 15px;
@@ -2,8 +2,10 @@
 .message {
   display: grid;
+  align-items: start;
   grid-template-columns: 60px minmax(0, 1fr);
-  padding-bottom: 28px;
+  padding-bottom: 14px;
+  padding-top: 14px;
   font-size: 18px;
   font-family: 'Noto Sans', Arial, sans-serif;
   line-height: 1.428571429;
@@ -100,6 +102,7 @@
 @media screen and (width <= 688px) {
   .message {
     display: grid;
+    align-items: start;
     grid-template-columns: 60px minmax(0, 1fr);
     padding-bottom: 25px;
     font-size: 15px;
@@ -16,6 +16,7 @@
 }

 .message {
-  padding-bottom: 2em;
+  padding-bottom: 1em;
+  padding-top: 1em;
   grid-template-columns: 70px minmax(0, 1fr);
 }
@@ -1,7 +1,9 @@
 .message {
   display: grid;
+  align-items: start;
   grid-template-columns: 60px minmax(0, 1fr);
-  padding-bottom: 2em;
+  padding-bottom: 1em;
+  padding-top: 1em;
   font-size: 15px;
   font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
   line-height: 22.5px !important;
@@ -1,5 +1,6 @@
 .message {
-  padding-bottom: 25px;
+  padding-bottom: 12.5px;
+  padding-top: 12.5px;
   font-size: 15px;
   font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
   line-height: 1.428571429;
@@ -1,5 +1,6 @@
 .message {
-  padding-bottom: 25px;
+  padding-bottom: 12.5px;
+  padding-top: 12.5px;
   font-size: 15px;
   font-family: 'Noto Sans', Helvetica, Arial, sans-serif;
   line-height: 1.428571429;
@@ -8,10 +8,6 @@
   padding-top: 0 !important;
 }

-.chat > .messages > :last-child {
-  margin-bottom: 1.7rem !important;
-}
-
 .chat .message-body p, .chat .message-body li {
   font-size: 1rem !important;
   line-height: 28px !important;
@@ -46,7 +42,7 @@
 }

 .chat .user-message {
-  background: #f5f5f5;
+  background: #f3f4f6;
   padding: 1.5rem 1rem;
   padding-bottom: 2rem;
   border-radius: 0;
@@ -61,16 +57,16 @@
 }

 .dark .chat .user-message {
-  background: transparent;
+  background: var(--light-gray);
 }

 .dark .chat .assistant-message {
-  background: var(--light-gray);
+  background: transparent;
 }

 .chat .user-message .text,
 .chat .assistant-message .text {
-  max-width: 645px;
+  max-width: 700px;
   margin-left: auto;
   margin-right: auto;
 }
css/main.css: 100 changed lines
@@ -2,7 +2,7 @@
   --darker-gray: #202123;
   --dark-gray: #343541;
   --light-gray: #444654;
-  --light-theme-gray: #f5f5f5;
+  --light-theme-gray: #f9fbff;
   --border-color-dark: #525252;
   --header-width: 112px;
   --selected-item-color-dark: #32333e;
@@ -389,7 +389,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 .chat {
   margin-left: auto;
   margin-right: auto;
-  min-height: var(--chat-height);
+  flex: 1;
   overflow-y: auto;
   display: flex;
   flex-direction: column;
@@ -401,10 +401,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }

 .chat-parent {
-  height: calc(100dvh - 98px - var(--input-delta));
+  flex: 1;
   overflow: auto !important;
   border-radius: 0 !important;
-  margin-bottom: var(--input-delta) !important;
 }

 .chat-parent .prose {
@@ -421,13 +420,13 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }

 .chat-parent.bigchat {
-  height: calc(100dvh - 98px - var(--input-delta)) !important;
-  margin-bottom: var(--input-delta) !important;
+  flex: 1;
 }

 .chat > .messages {
   display: flex;
   flex-direction: column;
+  min-height: calc(100vh - 102px);
 }

 .chat > .messages > :first-child {
@@ -546,7 +545,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
   border-radius: 5px;
   font-size: 82%;
   padding: 1px 3px;
-  background: white !important;
+  background: #f3f4f6 !important;
   color: #1f2328;
 }
@@ -560,18 +559,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
   padding: 15px;
 }

-.message-body :not(pre) > code::before {
-  content: "`";
-}
-
-.message-body :not(pre) > code::after {
-  content: "`";
-}
-
 .message-body :not(pre) > code {
   white-space: normal !important;
   font-weight: bold;
-  font-family: unset;
+  font-size: 0.95em;
+  font-family: Menlo,"Roboto Mono","Courier New",Courier,monospace,Inter,sans-serif;
+  padding: .15rem .3rem;
+  background-color: #ececec;
+}
+
+.dark .message-body :not(pre) > code {
+  background-color: rgb(255 255 255 / 10%);
 }

 #chat-input {
@@ -582,7 +580,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }

 #chat-input textarea {
+  background: #f3f4f6;
   padding: 0.65rem 2.5rem;
+  border: 0;
+  box-shadow: 0;
 }

 #chat-input textarea::placeholder {
@@ -603,8 +604,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }

 .chat-input-positioned {
-  position: absolute;
-  bottom: 0;
   max-width: 54rem;
   left: 50%;
   transform: translateX(-50%);
@@ -744,7 +743,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {

 .hover-menu button {
   width: 100%;
-  background: transparent !important;
+  background: white !important;
   border-radius: 0 !important;
   justify-content: space-between;
   margin: 0 !important;
@@ -760,7 +759,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }

 .hover-menu button:hover {
-  background: var(--button-secondary-background-fill-hover) !important;
+  background: #dbeafe !important;
+}
+
+.dark .hover-menu button:hover {
+  background: var(--selected-item-color-dark) !important;
 }

 .transparent-substring {
@@ -789,6 +792,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }

 #chat-input-container {
+  display: flex;
+  flex-direction: column;
   min-width: 0 !important;
 }
@@ -798,9 +803,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }

 #chat-input-row {
-  padding-bottom: 1.5em;
-  padding-left: 1rem;
-  padding-right: 1rem;
+  padding: 1rem;
+  padding-top: 0;
 }

 #chat-input-row.bigchat {
@@ -808,22 +812,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }

 #chat-col {
-  padding-bottom: 100px;
+  height: 100dvh;
+  display: flex;
+  flex-direction: column;
+  padding-bottom: 0;
+  gap: 0;
 }

 @media screen and (width <= 924px) {
   #chat-col {
-    padding-bottom: 100px;
     margin-top: 32px;
-    position: relative; /* Ensure positioning for the pseudo-element */
-  }
-
-  .chat-parent {
-    height: calc(100dvh - 98px - var(--input-delta) - 32px);
-  }
-
-  .chat-parent.bigchat {
-    height: calc(100dvh - 98px - var(--input-delta) - 32px) !important;
+    height: calc(100dvh - 32px);
   }
 }
@@ -985,6 +984,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
   cursor: pointer;
 }

+#past-chats .selected,
+#past-chats label:hover {
+  background-color: #dbeafe !important;
+}
+
 #past-chats-buttons,
 #delete-chat-row,
 #rename-row {
@@ -993,7 +997,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
   gap: 9px;
 }

-
 #past-chats-row,
 #chat-controls {
   width: 260px;
@@ -1111,12 +1114,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
   color: #9ca3af;
 }

-.dark .hover-menu {
-  background-color: var(--darker-gray);
-}
-
 .dark .hover-menu button {
   border-color: var(--border-color-primary);
+  background-color: var(--darker-gray) !important;
 }

 .dark #chat-controls,
@@ -1125,8 +1125,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
   border: 0 !important;
 }

-.dark #past-chats .selected,
-.dark #past-chats label:hover {
+.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats .selected,
+.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats label:hover {
   background-color: var(--selected-item-color-dark) !important;
 }
@@ -1163,7 +1163,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }

 .header_bar button.selected {
-  background: #E0E0E0;
+  background: #dbeafe;
 }

 #chat-controls,
@@ -1382,3 +1382,19 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
   50% { opacity: 1; }
   100% { opacity: 0.6; }
 }
+
+strong {
+  font-weight: bold;
+}
+
+.min.svelte-1ybaih5 {
+  min-height: 0;
+}
+
+#vram-info .value {
+  color: #008d00;
+}
+
+.dark #vram-info .value {
+  color: #07ff07;
+}
@@ -22,7 +22,7 @@ services:
         TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
         BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
         APP_GID: ${APP_GID:-6972}
-        APP_UID: ${APP_UID-6972}
+        APP_UID: ${APP_UID:-6972}
     env_file: .env
     user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
     ports:
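The only change in this and the matching compose hunks below is `${APP_UID-6972}` becoming `${APP_UID:-6972}`. The two forms differ when the variable is set but empty (for example a bare `APP_UID=` line in `.env`): plain `-` keeps the empty value, while `:-` falls back to the default. A minimal sketch of that difference, evaluated through bash (the sketch assumes bash is on PATH and is not part of the commit):

```
import os
import subprocess

env = {**os.environ, "APP_UID": ""}  # APP_UID is set but empty
for expansion in ["${APP_UID-6972}", "${APP_UID:-6972}"]:
    out = subprocess.run(["bash", "-c", f"echo {expansion}"],
                         env=env, capture_output=True, text=True).stdout.strip()
    print(f"{expansion!r} -> {out!r}")
# '${APP_UID-6972}'  -> ''      (plain "-" keeps the empty value)
# '${APP_UID:-6972}' -> '6972'  (":-" substitutes the default)
```

Docker Compose variable interpolation follows the same `-` versus `:-` distinction, so the new form also covers empty values.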
@@ -22,7 +22,7 @@ services:
        TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
        BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
        APP_GID: ${APP_GID:-6972}
-        APP_UID: ${APP_UID-6972}
+        APP_UID: ${APP_UID:-6972}
     env_file: .env
     user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
     ports:
@@ -22,7 +22,7 @@ services:
        TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
        BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
        APP_GID: ${APP_GID:-6972}
-        APP_UID: ${APP_UID-6972}
+        APP_UID: ${APP_UID:-6972}
     env_file: .env
     user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
     ports:
@@ -14,7 +14,7 @@ WORKDIR /home/app/
 RUN git clone https://github.com/oobabooga/text-generation-webui.git
 WORKDIR /home/app/text-generation-webui
 RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
-COPY CMD_FLAGS.txt /home/app/text-generation-webui/
+COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data
 EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
 WORKDIR /home/app/text-generation-webui
 # set umask to ensure group read / write at runtime
@@ -22,7 +22,7 @@ services:
        TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5}
        BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-}
        APP_GID: ${APP_GID:-6972}
-        APP_UID: ${APP_UID-6972}
+        APP_UID: ${APP_UID:-6972}
     env_file: .env
     user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}"
     ports:
@@ -31,17 +31,7 @@ services:
     stdin_open: true
     tty: true
     volumes:
-      - ./cache:/home/app/text-generation-webui/cache
-      - ./characters:/home/app/text-generation-webui/characters
-      - ./extensions:/home/app/text-generation-webui/extensions
-      - ./loras:/home/app/text-generation-webui/loras
-      - ./logs:/home/app/text-generation-webui/logs
-      - ./models:/home/app/text-generation-webui/models
-      - ./presets:/home/app/text-generation-webui/presets
-      - ./prompts:/home/app/text-generation-webui/prompts
-      - ./softprompts:/home/app/text-generation-webui/softprompts
-      - ./training:/home/app/text-generation-webui/training
-      - ./cloudflared:/etc/cloudflared
+      - ./user_data:/home/app/text-generation-webui/user_data
     deploy:
       resources:
         reservations:
@@ -257,6 +257,85 @@ headers = {

 in any of the examples above.

+#### Tool/Function Calling Example
+
+You need to use a model with tools support. The prompt will be automatically formatted using the model's Jinja2 template.
+
+Request:
+
+```
+curl http://127.0.0.1:5000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [
+      {
+        "role": "system",
+        "content": "You are a helpful assistant."
+      },
+      {
+        "role": "user",
+        "content": "What time is it currently in New York City?"
+      }
+    ],
+    "tools": [
+      {
+        "type": "function",
+        "function": {
+          "name": "get_current_time",
+          "description": "Get current time in a specific timezones",
+          "parameters": {
+            "type": "object",
+            "required": ["timezone"],
+            "properties": {
+              "timezone": {
+                "type": "string",
+                "description": "IANA timezone name (e.g., America/New_York, Europe/London). Use Europe/Berlin as local timezone if no timezone provided by the user."
+              }
+            }
+          }
+        }
+      }
+    ]
+  }'
+```
+
+Sample response:
+
+```
+{
+  "id": "chatcmpl-1746532051477984256",
+  "object": "chat.completion",
+  "created": 1746532051,
+  "model": "qwen2.5-coder-14b-instruct-q4_k_m.gguf",
+  "choices": [
+    {
+      "index": 0,
+      "finish_reason": "tool_calls",
+      "message": {
+        "role": "assistant",
+        "content": "```xml\n<function>\n{\n  \"name\": \"get_current_time\",\n  \"arguments\": {\n    \"timezone\": \"America/New_York\"\n  }\n}\n</function>\n```"
+      },
+      "tool_calls": [
+        {
+          "type": "function",
+          "function": {
+            "name": "get_current_time",
+            "arguments": "{\"timezone\": \"America/New_York\"}"
+          },
+          "id": "call_52ij07mh",
+          "index": "0"
+        }
+      ]
+    }
+  ],
+  "usage": {
+    "prompt_tokens": 224,
+    "completion_tokens": 38,
+    "total_tokens": 262
+  }
+}
+```
+
 ### Environment variables

 The following environment variables can be used (they take precedence over everything else):
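A minimal Python sketch of the same tool-calling request as the curl example added above, using the requests library. The endpoint, messages, and get_current_time tool mirror that example; the timeout and prints are illustrative and not part of the commit.

```
import json

import requests

url = "http://127.0.0.1:5000/v1/chat/completions"
payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What time is it currently in New York City?"},
    ],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_current_time",
            "description": "Get the current time in a specific timezone",
            "parameters": {
                "type": "object",
                "required": ["timezone"],
                "properties": {
                    "timezone": {"type": "string", "description": "IANA timezone name"},
                },
            },
        },
    }],
}

resp = requests.post(url, json=payload, timeout=120).json()
choice = resp["choices"][0]
if choice["finish_reason"] == "tool_calls":
    # Note: in this API the parsed calls sit next to "message", not inside it.
    for call in choice["tool_calls"]:
        args = json.loads(call["function"]["arguments"])
        print(call["function"]["name"], args)
```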
@@ -1,16 +1,14 @@
-import base64
 import copy
-import re
+import json
 import time
 from collections import deque
-from io import BytesIO

-import requests
 import tiktoken
-from PIL import Image
+from pydantic import ValidationError

 from extensions.openai.errors import InvalidRequestError
-from extensions.openai.utils import debug_msg
+from extensions.openai.typing import ToolDefinition
+from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall
 from modules import shared
 from modules.chat import (
     generate_chat_prompt,
@@ -96,72 +94,32 @@ def convert_history(history):
     user_input_last = True
     system_message = ""

-    # Multimodal: convert OpenAI format to multimodal extension format
-    if any('content' in entry and isinstance(entry['content'], list) for entry in history):
-        new_history = []
-        for entry in history:
-            if isinstance(entry['content'], list):
-                for item in entry['content']:
-                    if not isinstance(item, dict):
-                        continue
-
-                    image_url = None
-                    content = None
-                    if item['type'] == 'image_url' and isinstance(item['image_url'], dict):
-                        image_url = item['image_url']['url']
-                    elif item['type'] == 'text' and isinstance(item['text'], str):
-                        content = item['text']
-                    if image_url:
-                        new_history.append({"image_url": image_url, "role": "user"})
-                    if content:
-                        new_history.append({"content": content, "role": "user"})
-            else:
-                new_history.append(entry)
-
-        history = new_history
-
     for entry in history:
-        if "image_url" in entry:
-            image_url = entry['image_url']
-            if "base64" in image_url:
-                image_url = re.sub('^data:image/.+;base64,', '', image_url)
-                img = Image.open(BytesIO(base64.b64decode(image_url)))
-            else:
-                try:
-                    my_res = requests.get(image_url)
-                    img = Image.open(BytesIO(my_res.content))
-                except Exception:
-                    raise 'Image cannot be loaded from the URL!'
-
-            buffered = BytesIO()
-            if img.mode in ("RGBA", "P"):
-                img = img.convert("RGB")
-
-            img.save(buffered, format="JPEG")
-            img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
-            content = f'<img src="data:image/jpeg;base64,{img_str}">'
-        else:
-            content = entry["content"]
-
+        content = entry["content"]
         role = entry["role"]

         if role == "user":
             user_input = content
             user_input_last = True
             if current_message:
-                chat_dialogue.append([current_message, ''])
+                chat_dialogue.append([current_message, '', ''])
                 current_message = ""

             current_message = content
         elif role == "assistant":
+            if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "":
+                continue  # skip tool calls
             current_reply = content
             user_input_last = False
             if current_message:
-                chat_dialogue.append([current_message, current_reply])
+                chat_dialogue.append([current_message, current_reply, ''])
                 current_message = ""
                 current_reply = ""
             else:
-                chat_dialogue.append(['', current_reply])
+                chat_dialogue.append(['', current_reply, ''])
+        elif role == "tool":
+            user_input_last = False
+            chat_dialogue.append(['', '', content])
         elif role == "system":
             system_message += f"\n{content}" if system_message else content
@@ -181,6 +139,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     if 'messages' not in body:
         raise InvalidRequestError(message="messages is required", param='messages')

+    tools = None
+    if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0:
+        tools = validateTools(body['tools'])  # raises InvalidRequestError if validation fails
+
     messages = body['messages']
     for m in messages:
         if 'role' not in m:
@@ -238,6 +200,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         'custom_system_message': custom_system_message,
         'chat_template_str': chat_template_str,
         'chat-instruct_command': chat_instruct_command,
+        'tools': tools,
         'history': history,
         'stream': stream
     })
@@ -250,7 +213,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     requested_model = generate_params.pop('model')
     logprob_proc = generate_params.pop('logprob_proc', None)

-    def chat_streaming_chunk(content):
+    def chat_streaming_chunk(content, chunk_tool_calls=None):
         # begin streaming
         chunk = {
             "id": cmpl_id,
@@ -260,7 +223,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             resp_list: [{
                 "index": 0,
                 "finish_reason": None,
-                "delta": {'role': 'assistant', 'content': content},
+                "delta": {'role': 'assistant', 'content': content, 'tool_calls': chunk_tool_calls},
             }],
         }
@@ -269,6 +232,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]}
         # else:
         #     chunk[resp_list][0]["logprobs"] = None

         return chunk

     # generate reply #######################################
@@ -277,8 +241,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             yield {'prompt': prompt}
             return

-    debug_msg({'prompt': prompt, 'generate_params': generate_params})
-
     if stream:
         yield chat_streaming_chunk('')
@@ -288,8 +250,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
     answer = ''
     seen_content = ''

+    tool_calls = []
+    end_last_tool_call = 0
+    supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None
+
     for a in generator:
         answer = a['internal'][-1][1]

+        if supported_tools is not None:
+            tool_call = parseToolCall(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else []
+            if len(tool_call) > 0:
+                for tc in tool_call:
+                    tc["id"] = getToolCallId()
+                    tc["index"] = str(len(tool_calls))
+                    tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"])
+                    tool_calls.append(tc)
+                end_last_tool_call = len(answer)
+
         if stream:
             len_seen = len(seen_content)
             new_content = answer[len_seen:]
@@ -297,18 +274,25 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
             if not new_content or chr(0xfffd) in new_content:  # partial unicode character, don't send it yet.
                 continue

-            seen_content = answer
             chunk = chat_streaming_chunk(new_content)
+
+            seen_content = answer
             yield chunk

+            # stop generation if tool_calls were generated previously
+            if len(tool_calls) > 0:
+                break
+
     token_count = len(encode(prompt)[0])
     completion_token_count = len(encode(answer)[0])
     stop_reason = "stop"
+    if len(tool_calls) > 0:
+        stop_reason = "tool_calls"
     if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']:
         stop_reason = "length"

     if stream:
-        chunk = chat_streaming_chunk('')
+        chunk = chat_streaming_chunk('', tool_calls)
         chunk[resp_list][0]['finish_reason'] = stop_reason
         chunk['usage'] = {
             "prompt_tokens": token_count,
@@ -326,7 +310,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p
         resp_list: [{
             "index": 0,
             "finish_reason": stop_reason,
-            "message": {"role": "assistant", "content": answer}
+            "message": {"role": "assistant", "content": answer},
+            "tool_calls": tool_calls
         }],
         "usage": {
             "prompt_tokens": token_count,
@@ -515,3 +500,19 @@ def completions(body: dict, is_legacy: bool = False) -> dict:
 def stream_completions(body: dict, is_legacy: bool = False):
     for resp in completions_common(body, is_legacy, stream=True):
         yield resp
+
+
+def validateTools(tools: list[dict]):
+    # Validate each tool definition in the JSON array
+    valid_tools = None
+    for idx in range(len(tools)):
+        tool = tools[idx]
+        try:
+            tool_definition = ToolDefinition(**tool)
+            if valid_tools is None:
+                valid_tools = []
+            valid_tools.append(tool)
+        except ValidationError:
+            raise InvalidRequestError(message=f"Invalid tool specification at index {idx}.", param='tools')
+
+    return valid_tools
@@ -14,6 +14,7 @@ from fastapi.requests import Request
 from fastapi.responses import JSONResponse
 from pydub import AudioSegment
 from sse_starlette import EventSourceResponse
+from starlette.concurrency import iterate_in_threadpool

 import extensions.openai.completions as OAIcompletions
 import extensions.openai.images as OAIimages
@@ -115,7 +116,7 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
     async def generator():
         async with streaming_semaphore:
             response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy)
-            for resp in response:
+            async for resp in iterate_in_threadpool(response):
                 disconnected = await request.is_disconnected()
                 if disconnected:
                     break
@@ -125,7 +126,12 @@ async def openai_completions(request: Request, request_data: CompletionRequest):
         return EventSourceResponse(generator())  # SSE streaming

     else:
-        response = OAIcompletions.completions(to_dict(request_data), is_legacy=is_legacy)
+        response = await asyncio.to_thread(
+            OAIcompletions.completions,
+            to_dict(request_data),
+            is_legacy=is_legacy
+        )
+
         return JSONResponse(response)
@@ -138,7 +144,7 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
     async def generator():
         async with streaming_semaphore:
             response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy)
-            for resp in response:
+            async for resp in iterate_in_threadpool(response):
                 disconnected = await request.is_disconnected()
                 if disconnected:
                     break
@@ -148,7 +154,12 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion
         return EventSourceResponse(generator())  # SSE streaming

     else:
-        response = OAIcompletions.chat_completions(to_dict(request_data), is_legacy=is_legacy)
+        response = await asyncio.to_thread(
+            OAIcompletions.chat_completions,
+            to_dict(request_data),
+            is_legacy=is_legacy
+        )
+
         return JSONResponse(response)
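The pattern behind the two endpoint changes above, shown in isolation: a blocking generator is consumed through iterate_in_threadpool so the event loop keeps serving other requests between chunks, and a blocking non-streaming call is moved to a worker thread with asyncio.to_thread. slow_generator and slow_call below are stand-ins for the OAIcompletions functions, not project code.

```
import asyncio
import time

from starlette.concurrency import iterate_in_threadpool


def slow_generator():
    for i in range(3):
        time.sleep(0.1)  # stands in for token-by-token generation
        yield i


def slow_call():
    time.sleep(0.3)  # stands in for a full non-streaming completion
    return "done"


async def main():
    # Streaming case: iterate a sync generator without blocking the loop.
    async for item in iterate_in_threadpool(slow_generator()):
        print("chunk", item)

    # Non-streaming case: run the whole blocking call in a worker thread.
    print(await asyncio.to_thread(slow_call))


asyncio.run(main())
```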
@@ -436,7 +447,7 @@ def run_server():

     # Start server
     logging.getLogger("uvicorn.error").propagate = False
-    uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
+    uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False)


 def setup():
@@ -1,8 +1,8 @@
 import json
 import time
-from typing import Dict, List
+from typing import Dict, List, Optional

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, validator


 class GenerationOptions(BaseModel):
@@ -54,6 +54,48 @@ class GenerationOptions(BaseModel):
     grammar_string: str = ""


+class ToolDefinition(BaseModel):
+    function: 'ToolFunction'
+    type: str
+
+
+class ToolFunction(BaseModel):
+    description: str
+    name: str
+    parameters: 'ToolParameters'
+
+
+class ToolParameters(BaseModel):
+    properties: Optional[Dict[str, 'ToolProperty']] = None
+    required: Optional[list[str]] = None
+    type: str
+    description: Optional[str] = None
+
+
+class ToolProperty(BaseModel):
+    description: Optional[str] = None
+    type: Optional[str] = None  # we are faced with definitions like anyOf, e.g. {'type': 'function', 'function': {'name': 'git_create_branch', 'description': 'Creates a new branch from an optional base branch', 'parameters': {'type': 'object', 'properties': {'repo_path': {'title': 'Repo Path', 'type': 'string'}, 'branch_name': {'title': 'Branch Name', 'type': 'string'}, 'base_branch': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'Base Branch'}}, 'required': ['repo_path', 'branch_name'], 'title': 'GitCreateBranch'}}}
+
+
+class FunctionCall(BaseModel):
+    name: str
+    arguments: Optional[str] = None
+    parameters: Optional[str] = None
+
+    @validator('arguments', allow_reuse=True)
+    def checkPropertyArgsOrParams(cls, v, values, **kwargs):
+        if not v and not values.get('parameters'):
+            raise ValueError("At least one of 'arguments' or 'parameters' must be provided as property in FunctionCall type")
+        return v
+
+
+class ToolCall(BaseModel):
+    id: str
+    index: int
+    type: str
+    function: FunctionCall
+
+
 class CompletionRequestParams(BaseModel):
     model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.")
     prompt: str | List[str]
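A small sketch of how these models are meant to gate incoming tool specs (this is what validateTools in extensions/openai/completions.py does per entry); the example payloads are assumptions modeled on the get_current_time tool from the API docs hunk above.

```
from pydantic import ValidationError

from extensions.openai.typing import ToolDefinition

good_tool = {
    "type": "function",
    "function": {
        "name": "get_current_time",
        "description": "Get the current time in a specific timezone",
        "parameters": {"type": "object", "properties": {"timezone": {"type": "string"}}},
    },
}
ToolDefinition(**good_tool)  # validates silently

bad_tool = {"type": "function", "function": {"name": "broken"}}
try:
    ToolDefinition(**bad_tool)  # missing description/parameters
except ValidationError as e:
    print(e)
```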
@@ -92,6 +134,7 @@ class ChatCompletionRequestParams(BaseModel):
     frequency_penalty: float | None = 0
     function_call: str | dict | None = Field(default=None, description="Unused parameter.")
     functions: List[dict] | None = Field(default=None, description="Unused parameter.")
+    tools: List[dict] | None = Field(default=None, description="Tools signatures passed via MCP.")
     logit_bias: dict | None = None
     max_tokens: int | None = None
     n: int | None = Field(default=1, description="Unused parameter.")
@@ -1,5 +1,8 @@
 import base64
+import json
 import os
+import random
+import re
 import time
 import traceback
 from typing import Callable, Optional
@@ -52,3 +55,94 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star
         time.sleep(3)

     raise Exception('Could not start cloudflared.')
+
+
+def getToolCallId() -> str:
+    letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789"
+    b = [random.choice(letter_bytes) for _ in range(8)]
+    return "call_" + "".join(b).lower()
+
+
+def checkAndSanitizeToolCallCandidate(candidate_dict: dict, tool_names: list[str]):
+    # check if property 'function' exists and is a dictionary, otherwise adapt dict
+    if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str):
+        candidate_dict = {"type": "function", "function": candidate_dict}
+    if 'function' in candidate_dict and isinstance(candidate_dict['function'], str):
+        candidate_dict['name'] = candidate_dict['function']
+        del candidate_dict['function']
+        candidate_dict = {"type": "function", "function": candidate_dict}
+    if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict):
+        # check if 'name' exists within 'function' and is part of known tools
+        if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names:
+            candidate_dict["type"] = "function"  # ensure required property 'type' exists and has the right value
+            # map property 'parameters' used by some older models to 'arguments'
+            if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]:
+                candidate_dict["function"]["arguments"] = candidate_dict["function"]["parameters"]
+                del candidate_dict["function"]["parameters"]
+            return candidate_dict
+    return None
+
+
+def parseToolCall(answer: str, tool_names: list[str]):
+    matches = []
+
+    # abort on very short answers to save computation cycles
+    if len(answer) < 10:
+        return matches
+
+    # Define the regex pattern to find the JSON content wrapped in <function>, <tools>, <tool_call>, and other tags observed from various models
+    patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)</\1>"]
+
+    for pattern in patterns:
+        for match in re.finditer(pattern, answer, re.DOTALL):
+            # print(match.group(2))
+            if match.group(2) is None:
+                continue
+            # remove backtick wraps if present
+            candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip())
+            candidate = re.sub(r"```$", "", candidate.strip())
+            # unwrap inner tags
+            candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL)
+            # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually
+            if re.search(r"\}\s*\n\s*\{", candidate) is not None:
+                candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
+                if not candidate.strip().startswith("["):
+                    candidate = "[" + candidate + "]"
+
+            candidates = []
+            try:
+                # parse the candidate JSON into a dictionary
+                candidates = json.loads(candidate)
+                if not isinstance(candidates, list):
+                    candidates = [candidates]
+            except json.JSONDecodeError:
+                # Ignore invalid JSON silently
+                continue
+
+            for candidate_dict in candidates:
+                checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
+                if checked_candidate is not None:
+                    matches.append(checked_candidate)
+
+    # last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags
+    if len(matches) == 0:
+        try:
+            candidate = answer
+            # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually
+            if re.search(r"\}\s*\n\s*\{", candidate) is not None:
+                candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate)
+            if not candidate.strip().startswith("["):
+                candidate = "[" + candidate + "]"
+            # parse the candidate JSON into a dictionary
+            candidates = json.loads(candidate)
+            if not isinstance(candidates, list):
+                candidates = [candidates]
+            for candidate_dict in candidates:
+                checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names)
+                if checked_candidate is not None:
+                    matches.append(checked_candidate)
+        except json.JSONDecodeError:
+            # Ignore invalid JSON silently
+            pass
+
+    return matches
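A usage sketch for the parser added above: given a reply in the tagged style shown in the API docs' sample response, parseToolCall returns the sanitized call dictionaries (the completion code then adds the id and index and JSON-encodes the arguments). The reply string is illustrative.

```
from extensions.openai.utils import parseToolCall

answer = (
    "<function>\n"
    '{"name": "get_current_time", "arguments": {"timezone": "America/New_York"}}\n'
    "</function>"
)

calls = parseToolCall(answer, ["get_current_time"])
print(calls)
# [{'type': 'function',
#   'function': {'name': 'get_current_time',
#                'arguments': {'timezone': 'America/New_York'}}}]
```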
@@ -1,10 +1,11 @@
 import math
 import random
 import threading
-import torch
 import chromadb
 import numpy as np
 import posthog
+import torch
 from chromadb.config import Settings
 from chromadb.utils import embedding_functions
@ -292,6 +293,8 @@ class ChromaCollector():
|
||||||
|
|
||||||
for doc in documents:
|
for doc in documents:
|
||||||
doc_tokens = encode(doc)[0]
|
doc_tokens = encode(doc)[0]
|
||||||
|
if isinstance(doc_tokens, np.ndarray):
|
||||||
|
doc_tokens = doc_tokens.tolist()
|
||||||
doc_token_count = len(doc_tokens)
|
doc_token_count = len(doc_tokens)
|
||||||
if current_token_count + doc_token_count > max_token_count:
|
if current_token_count + doc_token_count > max_token_count:
|
||||||
# If adding this document would exceed the max token count,
|
# If adding this document would exceed the max token count,
|
||||||
|
|
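
The added isinstance guard normalizes NumPy token arrays to plain Python lists before they are counted and passed along. A small illustration of why such a conversion can matter (the JSON-serialization constraint shown here is an assumed example, not taken from the project):

import json

import numpy as np

doc_tokens = np.array([101, 2023, 2003, 102])

# np.ndarray (and NumPy integer scalars) are not JSON serializable,
# so normalize to a plain list before handing the tokens downstream
if isinstance(doc_tokens, np.ndarray):
    doc_tokens = doc_tokens.tolist()

print(len(doc_tokens), json.dumps(doc_tokens))
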
js/main.js
@@ -150,6 +150,16 @@ const observer = new MutationObserver(function(mutations) {
   if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) {
     targetElement.scrollTop = targetElement.scrollHeight;
   }
+
+  const chatElement = document.getElementById("chat");
+  if (chatElement) {
+    const messagesContainer = chatElement.querySelector(".messages");
+    const lastChild = messagesContainer?.lastElementChild;
+    const prevSibling = lastChild?.previousElementSibling;
+    if (lastChild && prevSibling) {
+      lastChild.style.minHeight = `calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px))`;
+    }
+  }
 });

 // Configure the observer to watch for changes in the subtree and attributes
@@ -442,12 +452,6 @@ function updateCssProperties() {

   // Check if the chat container is visible
   if (chatContainer.clientHeight > 0) {
-    const chatContainerParentHeight = chatContainer.parentNode.clientHeight;
-    const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`;
-
-    document.documentElement.style.setProperty("--chat-height", newChatHeight);
-    document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`);
-
     // Adjust scrollTop based on input height change
     if (chatInputHeight !== currentChatInputHeight) {
       const deltaHeight = chatInputHeight - currentChatInputHeight;
@@ -720,7 +724,7 @@ function isMobile() {
 // Function to initialize sidebars
 function initializeSidebars() {
   const isOnMobile = isMobile();

   if (isOnMobile) {
     // Mobile state: Hide sidebars and set closed states
     [pastChatsRow, chatControlsRow, headerBar].forEach(el => {
@@ -5,6 +5,7 @@ import html
 import json
 import pprint
 import re
+import time
 from datetime import datetime
 from functools import partial
 from pathlib import Path
@@ -145,7 +146,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
     instruct_renderer = partial(
         instruction_template.render,
         builtin_tools=None,
-        tools=None,
+        tools=state['tools'] if 'tools' in state else None,
         tools_in_user_message=False,
         add_generation_prompt=False
     )
@@ -171,9 +172,13 @@ def generate_chat_prompt(user_input, state, **kwargs):
         messages.append({"role": "system", "content": context})

     insert_pos = len(messages)
-    for user_msg, assistant_msg in reversed(history):
-        user_msg = user_msg.strip()
-        assistant_msg = assistant_msg.strip()
+    for entry in reversed(history):
+        user_msg = entry[0].strip()
+        assistant_msg = entry[1].strip()
+        tool_msg = entry[2].strip() if len(entry) > 2 else ''
+
+        if tool_msg:
+            messages.insert(insert_pos, {"role": "tool", "content": tool_msg})

         if assistant_msg:
             messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg})
@@ -394,16 +399,13 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess

         # Extract the reply
         if state['mode'] in ['chat', 'chat-instruct']:
-            visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply + '▍')
+            visible_reply = re.sub("(<USER>|<user>|{{user}})", state['name1'], reply)
         else:
-            visible_reply = reply + '▍'
+            visible_reply = reply

         visible_reply = html.escape(visible_reply)

         if shared.stop_everything:
-            if output['visible'][-1][1].endswith('▍'):
-                output['visible'][-1][1] = output['visible'][-1][1][:-1]
-
             output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
             yield output
             return
@@ -419,9 +421,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess
             if is_stream:
                 yield output

-    if output['visible'][-1][1].endswith('▍'):
-        output['visible'][-1][1] = output['visible'][-1][1][:-1]
-
     output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True)
     yield output

@@ -481,9 +480,17 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False):
         send_dummy_reply(state['start_with'], state)

     history = state['history']
+    last_save_time = time.monotonic()
+    save_interval = 8
     for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)):
         yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history

+        current_time = time.monotonic()
+        # Save on first iteration or if save_interval seconds have passed
+        if i == 0 or (current_time - last_save_time) >= save_interval:
+            save_history(history, state['unique_id'], state['character_menu'], state['mode'])
+            last_save_time = current_time
+
     save_history(history, state['unique_id'], state['character_menu'], state['mode'])
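
The new wrapper logic saves the chat history on the first streamed chunk, then at most once every save_interval seconds, with one final save after streaming ends. A generic sketch of that throttling pattern (the chunk source and save function here are stand-ins, not the project's):

import time


def save(snapshot):
    print(f"saved: {snapshot}")


def stream_with_periodic_saves(chunks, save_interval=8):
    last_save_time = time.monotonic()
    for i, chunk in enumerate(chunks):
        yield chunk
        now = time.monotonic()
        # Save on the first iteration or once save_interval seconds have passed
        if i == 0 or (now - last_save_time) >= save_interval:
            save(chunk)
            last_save_time = now
    # Always persist the final state after streaming ends
    save("final")


for _ in stream_with_periodic_saves(["a", "b", "c"], save_interval=8):
    pass
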
@@ -119,7 +119,7 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
             reset = True

         # Maximum number of tokens to process in a single forward pass
-        max_chunk_size = 2048
+        max_chunk_size = 256

         # Make the forward call
         if labels is None:
@@ -66,7 +66,7 @@ class LlamaServer:
             "top_k": state["top_k"],
             "top_p": state["top_p"],
             "min_p": state["min_p"],
-            "tfs_z": state["tfs"],
+            "top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1,
             "typical_p": state["typical_p"],
             "repeat_penalty": state["repetition_penalty"],
             "repeat_last_n": state["repetition_penalty_range"],
@@ -102,8 +102,10 @@ class LlamaServer:

         penalty_found = False
         for s in samplers:
-            if s.strip() in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]:
+            if s.strip() in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc"]:
                 filtered_samplers.append(s.strip())
+            elif s.strip() == "typical_p":
+                filtered_samplers.append("typ_p")
             elif not penalty_found and s.strip() == "repetition_penalty":
                 filtered_samplers.append("penalties")
                 penalty_found = True
@@ -144,8 +146,9 @@ class LlamaServer:
             pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
             print()

-        # Make a direct request with streaming enabled using a context manager
-        with self.session.post(url, json=payload, stream=True) as response:
+        # Make the generation request
+        response = self.session.post(url, json=payload, stream=True)
+        try:
             response.raise_for_status()  # Raise an exception for HTTP errors

             full_text = ""
@@ -182,6 +185,8 @@ class LlamaServer:
                     print(f"JSON decode error: {e}")
                     print(f"Problematic line: {line}")
                     continue
+        finally:
+            response.close()

     def generate(self, prompt, state):
         output = ""
@@ -210,14 +215,15 @@ class LlamaServer:
             pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
             print()

-        response = self.session.post(url, json=payload)
-        result = response.json()
+        for retry in range(5):
+            response = self.session.post(url, json=payload)
+            result = response.json()

             if "completion_probabilities" in result:
                 if use_samplers:
                     return result["completion_probabilities"][0]["top_probs"]
                 else:
                     return result["completion_probabilities"][0]["top_logprobs"]
             else:
                 raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
@@ -255,9 +261,10 @@ class LlamaServer:
             self.server_path,
             "--model", self.model_path,
             "--ctx-size", str(shared.args.ctx_size),
-            "--n-gpu-layers", str(shared.args.n_gpu_layers),
+            "--gpu-layers", str(shared.args.gpu_layers),
             "--batch-size", str(shared.args.batch_size),
             "--port", str(self.port),
+            "--no-webui",
         ]

         if shared.args.flash_attn:
@@ -278,8 +285,10 @@ class LlamaServer:
             cmd.append("--no-kv-offload")
         if shared.args.row_split:
             cmd += ["--split-mode", "row"]
+        cache_type = "fp16"
         if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types:
             cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
+            cache_type = shared.args.cache_type
         if shared.args.compress_pos_emb != 1:
             cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
         if shared.args.rope_freq_base > 0:
@@ -316,9 +325,15 @@ class LlamaServer:
         for flag_item in extra_flags.split(','):
             if '=' in flag_item:
                 flag, value = flag_item.split('=', 1)
-                cmd += [f"--{flag}", value]
+                if len(flag) <= 3:
+                    cmd += [f"-{flag}", value]
+                else:
+                    cmd += [f"--{flag}", value]
             else:
-                cmd.append(f"--{flag_item}")
+                if len(flag_item) <= 3:
+                    cmd.append(f"-{flag_item}")
+                else:
+                    cmd.append(f"--{flag_item}")

         env = os.environ.copy()
         if os.name == 'posix':
@@ -333,6 +348,7 @@ class LlamaServer:
         print(' '.join(str(item) for item in cmd[1:]))
         print()

+        logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}")
         # Start the server with pipes for output
         self.process = subprocess.Popen(
             cmd,
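
The extra-flags handling above now chooses a single-dash prefix for short flag names (three characters or fewer) and a double dash otherwise before appending them to the llama-server command line. A standalone sketch of that parsing; the concrete flag strings below are only examples:

def build_extra_flag_args(extra_flags: str) -> list[str]:
    args = []
    for flag_item in extra_flags.split(','):
        if '=' in flag_item:
            flag, value = flag_item.split('=', 1)
            prefix = '-' if len(flag) <= 3 else '--'
            args += [f"{prefix}{flag}", value]
        else:
            prefix = '-' if len(flag_item) <= 3 else '--'
            args.append(f"{prefix}{flag_item}")
    return args


print(build_extra_flag_args("override-tensor=exps=CPU,fa,ctk=q8_0"))
# ['--override-tensor', 'exps=CPU', '-fa', '-ctk', 'q8_0']
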
@@ -5,7 +5,7 @@ import gradio as gr

 loaders_and_params = OrderedDict({
     'llama.cpp': [
-        'n_gpu_layers',
+        'gpu_layers',
         'threads',
         'threads_batch',
         'batch_size',
@@ -28,6 +28,7 @@ loaders_and_params = OrderedDict({
         'device_draft',
         'ctx_size_draft',
         'speculative_decoding_accordion',
+        'vram_info',
     ],
     'Transformers': [
         'gpu_split',
@@ -84,7 +85,6 @@ loaders_and_params = OrderedDict({
         'no_flash_attn',
         'no_xformers',
         'no_sdpa',
-        'exllamav2_info',
         'model_draft',
         'draft_max',
         'ctx_size_draft',
@@ -299,7 +299,7 @@ loaders_samplers = {
         'typical_p',
         'xtc_threshold',
         'xtc_probability',
-        'tfs',
+        'top_n_sigma',
         'dry_multiplier',
         'dry_allowed_length',
         'dry_base',
@@ -7,6 +7,7 @@ from modules import models, shared
 from modules.logging_colors import logger
 from modules.models import load_model
 from modules.text_generation import generate_reply
+from modules.utils import check_model_loaded

 global_scores = None

@@ -33,9 +34,9 @@ def get_next_logits(*args, **kwargs):


 def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
-    if shared.model is None:
-        logger.error("No model is loaded! Select one in the Model tab.")
-        return 'Error: No model is loaded1 Select one in the Model tab.', previous
+    model_is_loaded, error_message = check_model_loaded()
+    if not model_is_loaded:
+        return error_message, previous

     # llama.cpp case
     if shared.model.__class__.__name__ == 'LlamaServer':
@@ -71,7 +71,6 @@ def llama_cpp_server_loader(model_name):
     else:
         model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]

-    logger.info(f"llama.cpp weights detected: \"{model_file}\"")
     try:
         model = LlamaServer(model_file)
         return model, model
@@ -1,7 +1,11 @@
+import functools
 import json
 import re
+import subprocess
+from math import exp
 from pathlib import Path

+import gradio as gr
 import yaml

 from modules import chat, loaders, metadata_gguf, shared, ui
@@ -54,7 +58,7 @@ def get_model_metadata(model):
         else:
             model_file = list(path.glob('*.gguf'))[0]

-        metadata = metadata_gguf.load_metadata(model_file)
+        metadata = load_gguf_metadata_with_cache(model_file)

         for k in metadata:
             if k.endswith('context_length'):
@@ -67,7 +71,7 @@ def get_model_metadata(model):
             elif k.endswith('rope.scaling.factor'):
                 model_settings['compress_pos_emb'] = metadata[k]
             elif k.endswith('block_count'):
-                model_settings['n_gpu_layers'] = metadata[k] + 1
+                model_settings['gpu_layers'] = metadata[k] + 1

         if 'tokenizer.chat_template' in metadata:
             template = metadata['tokenizer.chat_template']
@@ -209,15 +213,27 @@ def apply_model_settings_to_state(model, state):
     model_settings = get_model_metadata(model)
     if 'loader' in model_settings:
         loader = model_settings.pop('loader')

-        # If the user is using an alternative loader for the same model type, let them keep using it
         if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
             state['loader'] = loader

     for k in model_settings:
-        if k in state:
+        if k in state and k != 'gpu_layers':  # Skip gpu_layers, handle separately
             state[k] = model_settings[k]

+    # Handle GPU layers and VRAM update for llama.cpp
+    if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
+        vram_info, gpu_layers_update = update_gpu_layers_and_vram(
+            state['loader'],
+            model,
+            model_settings['gpu_layers'],
+            state['ctx_size'],
+            state['cache_type'],
+            auto_adjust=True
+        )
+
+        state['gpu_layers'] = gpu_layers_update
+        state['vram_info'] = vram_info
+
     return state
@@ -277,3 +293,186 @@ def save_instruction_template(model, template):
         yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
     else:
         yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")
+
+
+@functools.lru_cache(maxsize=1)
+def load_gguf_metadata_with_cache(model_file):
+    return metadata_gguf.load_metadata(model_file)
+
+
+def get_model_size_mb(model_file: Path) -> float:
+    filename = model_file.name
+
+    # Check for multipart pattern
+    match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename)
+
+    if match:
+        # It's a multipart file, find all matching parts
+        base_pattern = match.group(1)
+        part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf'))
+        total_size = sum(p.stat().st_size for p in part_files)
+    else:
+        # Single part
+        total_size = model_file.stat().st_size
+
+    return total_size / (1024 ** 2)  # Return size in MB
+
+
+def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
+    model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
+    metadata = load_gguf_metadata_with_cache(model_file)
+    size_in_mb = get_model_size_mb(model_file)
+
+    # Extract values from metadata
+    n_layers = None
+    n_kv_heads = None
+    embedding_dim = None
+    context_length = None
+    feed_forward_dim = None
+
+    for key, value in metadata.items():
+        if key.endswith('.block_count'):
+            n_layers = value
+        elif key.endswith('.attention.head_count_kv'):
+            n_kv_heads = value
+        elif key.endswith('.embedding_length'):
+            embedding_dim = value
+        elif key.endswith('.context_length'):
+            context_length = value
+        elif key.endswith('.feed_forward_length'):
+            feed_forward_dim = value
+
+    if gpu_layers > n_layers:
+        gpu_layers = n_layers
+
+    # Convert cache_type to numeric
+    if cache_type == 'q4_0':
+        cache_type = 4
+    elif cache_type == 'q8_0':
+        cache_type = 8
+    else:
+        cache_type = 16
+
+    # Derived features
+    size_per_layer = size_in_mb / max(n_layers, 1e-6)
+    context_per_layer = context_length / max(n_layers, 1e-6)
+    ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6)
+    kv_cache_factor = n_kv_heads * cache_type * ctx_size
+
+    # Helper function for smaller
+    def smaller(x, y):
+        return 1 if x < y else 0
+
+    # Calculate VRAM using the model
+    # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
+    vram = (
+        (size_per_layer - 21.19195204848197)
+        * exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845))
+        + 0.0006621544775632052 * context_per_layer
+        + 3.34664386576376e-05 * kv_cache_factor
+    ) * (1.363306170123392 + gpu_layers) + 1255.163594536052
+
+    return vram
+
+
+def get_nvidia_free_vram():
+    """
+    Calculates the total free VRAM across all NVIDIA GPUs by parsing nvidia-smi output.
+
+    Returns:
+        int: The total free VRAM in MiB summed across all detected NVIDIA GPUs.
+             Returns -1 if nvidia-smi command fails (not found, error, etc.).
+             Returns 0 if nvidia-smi succeeds but no GPU memory info found.
+    """
+    try:
+        # Execute nvidia-smi command
+        result = subprocess.run(
+            ['nvidia-smi'],
+            capture_output=True,
+            text=True,
+            check=False
+        )
+
+        # Check if nvidia-smi returned an error
+        if result.returncode != 0:
+            return -1
+
+        # Parse the output for memory usage patterns
+        output = result.stdout
+
+        # Find memory usage like "XXXXMiB / YYYYMiB"
+        # Captures used and total memory for each GPU
+        matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output)
+
+        if not matches:
+            # No GPUs found in expected format
+            return 0
+
+        total_free_vram_mib = 0
+        for used_mem_str, total_mem_str in matches:
+            try:
+                used_mib = int(used_mem_str)
+                total_mib = int(total_mem_str)
+                total_free_vram_mib += (total_mib - used_mib)
+            except ValueError:
+                # Skip malformed entries
+                pass
+
+        return total_free_vram_mib
+
+    except FileNotFoundError:
+        # nvidia-smi not found (likely no NVIDIA drivers installed)
+        return -1
+    except Exception:
+        # Handle any other unexpected exceptions
+        return -1
+
+
+def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
+    """
+    Unified function to handle GPU layers and VRAM updates.
+
+    Args:
+        for_ui: If True, returns Gradio updates. If False, returns raw values.
+
+    Returns:
+        - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
+        - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
+    """
+    if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"):
+        vram_info = "<div id=\"vram-info\"'>Estimated VRAM to load the model:</span>"
+        if for_ui:
+            return (vram_info, gr.update()) if auto_adjust else vram_info
+        else:
+            return (0, gpu_layers) if auto_adjust else 0
+
+    current_layers = gpu_layers
+    max_layers = gpu_layers
+
+    if auto_adjust:
+        # Get max layers from model metadata
+        model_settings = get_model_metadata(model)
+        max_layers = model_settings.get('gpu_layers', gpu_layers)
+
+        # Auto-adjust based on available VRAM
+        available_vram = get_nvidia_free_vram()
+        if available_vram > 0:
+            tolerance = 906
+            current_layers = max_layers
+            while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
+                current_layers -= 1
+
+    # Calculate VRAM with current layers
+    vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)
+
+    if for_ui:
+        vram_info = f"<div id=\"vram-info\"'>Estimated VRAM to load the model: <span class=\"value\">{vram_usage:.0f} MiB</span>"
+        if auto_adjust:
+            return vram_info, gr.update(value=current_layers, maximum=max_layers)
+        else:
+            return vram_info
+    else:
+        if auto_adjust:
+            return vram_usage, current_layers
+        else:
+            return vram_usage
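
update_gpu_layers_and_vram walks the layer count down from the model's maximum until the VRAM estimate fits into the reported free VRAM minus a fixed tolerance. The search itself is independent of the estimator; a minimal sketch with a stand-in estimate function:

def fit_gpu_layers(estimate_vram_mib, max_layers, free_vram_mib, tolerance_mib=906):
    """Return the largest layer count whose estimated VRAM fits in the available budget."""
    layers = max_layers
    while layers > 0 and estimate_vram_mib(layers) > free_vram_mib - tolerance_mib:
        layers -= 1
    return layers


def fake_estimate(layers):
    # Stand-in estimator: a fixed base cost plus a per-layer cost, in MiB
    return 1200 + 350 * layers


print(fit_gpu_layers(fake_estimate, max_layers=48, free_vram_mib=12000))  # 28
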
@@ -11,7 +11,7 @@ from modules.logging_colors import logger


 def default_preset():
-    return {
+    result = {
         'temperature': 1,
         'dynatemp_low': 1,
         'dynatemp_high': 1,
@@ -46,10 +46,17 @@ def default_preset():
         'do_sample': True,
         'dynamic_temperature': False,
         'temperature_last': False,
-        'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
+        'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntop_n_sigma\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
         'dry_sequence_breakers': '"\\n", ":", "\\"", "*"',
     }

+    if shared.args.portable:
+        samplers = result['sampler_priority'].split('\n')
+        samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]]
+        result['sampler_priority'] = '\n'.join(samplers)
+
+    return result


 def presets_params():
     return [k for k in default_preset()]
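
In portable builds, default_preset now trims the sampler priority string down to the samplers the llama.cpp backend exposes, preserving their relative order. A compact illustration with a shortened priority string (the values are examples only):

ALLOWED = ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]

priority = "repetition_penalty\npresence_penalty\ndry\ntop_n_sigma\ntemperature\ntop_k\ntop_p\nmin_p\nxtc"
# presence_penalty is dropped; everything else keeps its original position
filtered = "\n".join(s for s in priority.split("\n") if s in ALLOWED)
print(filtered.split("\n"))
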
@@ -47,7 +47,6 @@ settings = {
     'max_new_tokens_max': 4096,
     'prompt_lookup_num_tokens': 0,
     'max_tokens_second': 0,
-    'max_updates_second': 12,
     'auto_max_new_tokens': True,
     'ban_eos_token': False,
     'add_bos_token': True,
@@ -60,7 +59,6 @@ settings = {
     'custom_stopping_strings': '',
     'custom_token_bans': '',
     'negative_prompt': '',
-    'autoload_model': False,
     'dark_theme': True,
     'default_extensions': [],
     'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}",
@@ -121,7 +119,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa
 group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
 group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
 group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
-group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
+group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.')
 group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
 group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
 group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
@@ -130,9 +128,9 @@ group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to
 group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')

 # Cache
-group = parser.add_argument_group('Context and cache management')
+group = parser.add_argument_group('Context and cache')
 group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.')
-group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')
+group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).')

 # Speculative decoding
 group = parser.add_argument_group('Speculative decoding')
@@ -161,10 +159,6 @@ group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='B
 group = parser.add_argument_group('TensorRT-LLM')
 group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')

-# Cache
-group = parser.add_argument_group('Cache')
-group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
-
 # DeepSpeed
 group = parser.add_argument_group('DeepSpeed')
 group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
@@ -190,6 +184,7 @@ group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certific
 group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None)
 group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy')
 group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.')
+group.add_argument('--portable', action='store_true', help='Hide features not available in portable mode like training.')

 # API
 group = parser.add_argument_group('API')
@@ -311,11 +306,13 @@ if args.api or args.public_api:
     add_extension('openai', last=True)

 # Load model-specific settings
-with Path(f'{args.model_dir}/config.yaml') as p:
-    if p.exists():
-        model_config = yaml.safe_load(open(p, 'r').read())
-    else:
-        model_config = {}
+p = Path(f'{args.model_dir}/config.yaml')
+if p.exists():
+    model_config = yaml.safe_load(open(p, 'r').read())
+else:
+    model_config = {}
+del p

 # Load custom model-specific settings
 user_config = load_user_config()
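
The renamed --gpu-layers flag keeps --n-gpu-layers as an alias, so older launch commands continue to work while argparse stores a single value. A minimal sketch of that aliasing pattern (a standalone parser, not the project's):

import argparse

parser = argparse.ArgumentParser()
# New preferred name first, legacy spelling kept as an alias; both write to args.gpu_layers
parser.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N',
                    help='Number of layers to offload to the GPU.')

print(parser.parse_args(['--n-gpu-layers', '33']).gpu_layers)  # 33
print(parser.parse_args([]).gpu_layers)                        # 256
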
@@ -1,15 +1,15 @@
 from pathlib import Path

-import torch
-
 import tensorrt_llm
+import torch
+from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp

 from modules import shared
 from modules.logging_colors import logger
 from modules.text_generation import (
     get_max_prompt_length,
     get_reply_from_output_ids
 )
-from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp


 class TensorRTLLMModel:
@@ -14,6 +14,7 @@ from modules.callbacks import Iteratorize
 from modules.extensions import apply_extensions
 from modules.html_generator import generate_basic_html
 from modules.logging_colors import logger
+from modules.utils import check_model_loaded


 def generate_reply(*args, **kwargs):
@@ -34,8 +35,8 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
     # Find the appropriate generation function
     generate_func = apply_extensions('custom_generate_reply')
     if generate_func is None:
-        if shared.model_name == 'None' or shared.model is None:
-            logger.error("No model is loaded! Select one in the Model tab.")
+        model_is_loaded, error_message = check_model_loaded()
+        if not model_is_loaded:
             yield ''
             return

@@ -64,41 +65,39 @@
             all_stop_strings += st

     shared.stop_everything = False
-    last_update = -1
     reply = ''
     is_stream = state['stream']
     if len(all_stop_strings) > 0 and not state['stream']:
         state = copy.deepcopy(state)
         state['stream'] = True

-    min_update_interval = 0
-    if state.get('max_updates_second', 0) > 0:
-        min_update_interval = 1 / state['max_updates_second']
-
     # Generate
+    last_update = -1
+    latency_threshold = 1 / 1000
     for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat):
+        cur_time = time.monotonic()
         reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
         if escape_html:
             reply = html.escape(reply)

         if is_stream:
-            cur_time = time.time()
-
             # Limit number of tokens/second to make text readable in real time
             if state['max_tokens_second'] > 0:
                 diff = 1 / state['max_tokens_second'] - (cur_time - last_update)
                 if diff > 0:
                     time.sleep(diff)

-                last_update = time.time()
+                last_update = time.monotonic()
                 yield reply

             # Limit updates to avoid lag in the Gradio UI
             # API updates are not limited
             else:
-                if cur_time - last_update > min_update_interval:
-                    last_update = cur_time
+                # If 'generate_func' takes less than 0.001 seconds to yield the next token
+                # (equivalent to more than 1000 tok/s), assume that the UI is lagging behind and skip yielding
+                if (cur_time - last_update) > latency_threshold:
                     yield reply
+                    last_update = time.monotonic()

         if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything):
             break
@@ -471,7 +470,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None,
             t1 = time.time()
             original_tokens = len(original_input_ids[0])
             new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
-            print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
+            logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
             return


@@ -480,7 +479,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
     For models that do not use the transformers library for sampling
     """

-    seed = set_manual_seed(state['seed'])
+    state['seed'] = set_manual_seed(state['seed'])
     t0 = time.time()
     reply = ''
     try:
@@ -500,7 +499,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N
         t1 = time.time()
         original_tokens = len(encode(original_question)[0])
         new_tokens = len(encode(original_question + reply)[0]) - original_tokens
-        print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
+        logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})')
         return
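
The reworked streaming loop replaces the fixed max_updates_second cap with a latency check: if the generator produces the next token in under a millisecond, the UI update is skipped on the assumption that the interface is lagging behind. A self-contained sketch of that skip-when-lagging pattern with a simulated token source:

import time


def tokens():
    for i in range(50):
        yield f"token{i}"


latency_threshold = 1 / 1000  # seconds between yields below which the UI is assumed to lag
last_update = -1
forwarded = 0

for reply in tokens():
    cur_time = time.monotonic()
    # Only forward this partial reply if the producer is not outrunning the consumer
    if (cur_time - last_update) > latency_threshold:
        forwarded += 1
        last_update = time.monotonic()

print(f"forwarded {forwarded} of 50 updates")
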
@@ -61,7 +61,7 @@ if not shared.args.old_colors:
         background_fill_primary_dark='var(--darker-gray)',
         body_background_fill="white",
         block_background_fill="transparent",
-        body_text_color="#333",
+        body_text_color='rgb(64, 64, 64)',
         button_secondary_background_fill="#f4f4f4",
         button_secondary_border_color="var(--border-color-primary)",
@@ -105,7 +105,7 @@ def list_model_elements():
         'filter_by_loader',
         'loader',
         'cpu_memory',
-        'n_gpu_layers',
+        'gpu_layers',
         'threads',
         'threads_batch',
         'batch_size',
@@ -192,7 +192,6 @@ def list_interface_input_elements():
         'max_new_tokens',
         'prompt_lookup_num_tokens',
         'max_tokens_second',
-        'max_updates_second',
         'do_sample',
         'dynamic_temperature',
         'temperature_last',
@@ -46,8 +46,8 @@ def create_ui():

     with gr.Row():
         with gr.Column(elem_id='chat-col'):
-            shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
             shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer
+            shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
     with gr.Row(elem_id="chat-input-row"):
         with gr.Column(scale=1, elem_id='gr-hover-container'):
             gr.HTML(value='<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">☰</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover')
@@ -14,6 +14,7 @@ from modules.models_settings import (
     get_model_metadata,
     save_instruction_template,
     save_model_settings,
+    update_gpu_layers_and_vram,
     update_model_parameters
 )
 from modules.utils import gradio
@@ -26,71 +27,36 @@ def create_ui():
     with gr.Row():
         with gr.Column():
             with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
-                        ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
-                        shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu)
-                        shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
-                        shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)
+                shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu)
+                ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
+                shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu)
+                shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu)
+                shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu)

-                with gr.Column():
-                    with gr.Row():
-                        shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
-                        ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
-                        shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)
-
-        with gr.Row():
-            with gr.Column():
-                shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
+            shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None)

             with gr.Blocks():
+                gr.Markdown("## Main options")
                 with gr.Row():
                     with gr.Column():
-                        shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
-                        shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
-                        shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
-                        shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
-                        shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
-                        shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
-                        shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
-                        shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
+                        shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
+                        shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
                         shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
-                        shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
-                        shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
-                        shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
-                        shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
-                        shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
-                        shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
-                        shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
-                        shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')
+                        shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
+                        shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)

                     with gr.Column():
+                        shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
+                        shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
+                        shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                        shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
                        shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
                        shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
-                        shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
                        shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
-                        shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
-                        shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
-                        shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
-                        shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
-                        shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
-                        shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
-                        shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
-                        shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
                        shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.')
|
||||||
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.')
|
|
||||||
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
|
|
||||||
shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
|
shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.')
|
||||||
shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).')
|
shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).')
|
||||||
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
|
|
||||||
shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
|
|
||||||
shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
|
|
||||||
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
|
|
||||||
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
|
shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
|
||||||
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
|
shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
|
||||||
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
|
|
||||||
shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
|
|
||||||
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
|
shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')
|
||||||
|
|
||||||
# Speculative decoding
|
# Speculative decoding
|
||||||
|
@@ -99,15 +65,50 @@ def create_ui():
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.')
shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.')
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.')
shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1')
shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.')

with gr.Column():
gr.Markdown("## Other options")
with gr.Row():
with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'):
shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu)
with gr.Row():
with gr.Column():
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags)
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')

with gr.Column():
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')
shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.')
shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16)
shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn)
shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers)
shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')

if not shared.args.portable:
with gr.Row():
shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu)
ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu)
shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu)

with gr.Column():
with gr.Tab("Download"):
shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.", interactive=not mu)
shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu)
@@ -132,11 +133,10 @@ def create_event_handlers():
# In this event handler, the interface state is read and updated
# with the model defaults (if any), and then the model is loaded
# unless "autoload_model" is unchecked
shared.gradio['model_menu'].change(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then(
load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success(
partial(load_model_wrapper, autoload=False), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success(
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)

shared.gradio['load_model'].click(
@@ -145,15 +145,31 @@ def create_event_handlers():
partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success(
handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False)

shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False)
shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then(
partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False)

shared.gradio['save_model_settings'].click(
ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)

shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
# For ctx_size and cache_type - auto-adjust GPU layers
for param in ['ctx_size', 'cache_type']:
shared.gradio[param].change(
partial(update_gpu_layers_and_vram, auto_adjust=True),
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info', 'gpu_layers'), show_progress=False)

# For manual gpu_layers changes - only update VRAM
shared.gradio['gpu_layers'].change(
partial(update_gpu_layers_and_vram, auto_adjust=False),
gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
gradio('vram_info'), show_progress=False)
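The handlers above reuse a single callback through functools.partial: ctx_size/cache_type changes auto-adjust gpu_layers and refresh the VRAM estimate, while manual gpu_layers edits only refresh the estimate. Below is a minimal, self-contained sketch of that wiring pattern; the component names and the estimate_vram() callback are illustrative stand-ins, not the project's actual implementation.

```python
# Minimal sketch of the partial()-based event wiring shown in the hunk above.
# estimate_vram() and the component layout are hypothetical placeholders.
from functools import partial

import gradio as gr


def estimate_vram(ctx_size, gpu_layers, auto_adjust=False):
    # Placeholder arithmetic; the real project derives this from model metadata.
    vram = 0.5 * gpu_layers + ctx_size / 8192
    if auto_adjust:
        # When auto-adjusting, also return a proposed gpu_layers value.
        return f"~{vram:.1f} GB", min(int(gpu_layers) + 1, 256)
    return f"~{vram:.1f} GB"


with gr.Blocks() as demo:
    ctx_size = gr.Number(value=8192, label="ctx-size")
    gpu_layers = gr.Slider(0, 256, value=32, step=1, label="gpu-layers")
    vram_info = gr.HTML()

    # Changing the context auto-adjusts gpu_layers and refreshes the estimate.
    ctx_size.change(partial(estimate_vram, auto_adjust=True),
                    [ctx_size, gpu_layers], [vram_info, gpu_layers], show_progress=False)
    # Manually moving the slider only refreshes the estimate.
    gpu_layers.change(partial(estimate_vram, auto_adjust=False),
                      [ctx_size, gpu_layers], [vram_info], show_progress=False)

if __name__ == "__main__":
    demo.launch()
```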
if not shared.args.portable:
shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)

shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model'))
shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True)

@@ -192,6 +208,26 @@ def load_lora_wrapper(selected_loras):
def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
try:
# Handle direct GGUF URLs
if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")):
try:
path = repo_id.split("huggingface.co/")[1]

# Extract the repository ID (first two parts of the path)
parts = path.split("/")
if len(parts) >= 2:
extracted_repo_id = f"{parts[0]}/{parts[1]}"

# Extract the filename (last part of the path)
filename = repo_id.split("/")[-1]
if "?download=true" in filename:
filename = filename.replace("?download=true", "")

repo_id = extracted_repo_id
specific_file = filename
except:
pass

if repo_id == "":
yield ("Please enter a model path")
return

@@ -205,6 +241,18 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
yield ("Getting the download links from Hugging Face")
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file)

# Check for multiple GGUF files
gguf_files = [link for link in links if link.lower().endswith('.gguf')]
if len(gguf_files) > 1 and not specific_file:
output = "Multiple GGUF files found. Please copy one of the following filenames to the 'File name' field:\n\n```\n"
for link in gguf_files:
output += f"{Path(link).name}\n"

output += "```"
yield output
return

if return_links:
output = "```\n"
for link in links:
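The branch added to download_model_wrapper above reduces a direct Hugging Face .gguf link to a (repo_id, filename) pair before the normal download path runs. A standalone sketch of the same parsing idea follows; the helper name and example URL are illustrative, not taken from the codebase.

```python
# Illustrative helper: turn a direct Hugging Face GGUF link into (repo_id, filename).
# Mirrors the logic added above; split_gguf_url() itself is hypothetical.
def split_gguf_url(url: str):
    if not (url.startswith("https://") and "huggingface.co" in url
            and (url.endswith(".gguf") or url.endswith(".gguf?download=true"))):
        return None  # Not a direct GGUF link; caller falls back to user/repo syntax.

    path = url.split("huggingface.co/")[1]
    parts = path.split("/")
    if len(parts) < 2:
        return None

    repo_id = f"{parts[0]}/{parts[1]}"                  # e.g. "TheBloke/Llama-2-7B-GGUF"
    filename = parts[-1].replace("?download=true", "")  # e.g. "llama-2-7b.Q4_K_M.gguf"
    return repo_id, filename


print(split_gguf_url(
    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_K_M.gguf?download=true"
))
# ('TheBloke/Llama-2-7B-GGUF', 'llama-2-7b.Q4_K_M.gguf')
```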
@@ -252,10 +300,34 @@ def update_truncation_length(current_length, state):
return current_length


def get_initial_vram_info():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
return update_gpu_layers_and_vram(
shared.args.loader,
shared.model_name,
shared.args.gpu_layers,
shared.args.ctx_size,
shared.args.cache_type,
auto_adjust=False,
for_ui=True
)

return "<div id=\"vram-info\"'>Estimated VRAM to load the model:</span>"


def get_initial_gpu_layers_max():
if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
model_settings = get_model_metadata(shared.model_name)
return model_settings.get('gpu_layers', 256)

return 256


def handle_load_model_event_initial(model, state):
state = apply_model_settings_to_state(model, state)
output = ui.apply_interface_values(state)
update_model_parameters(state)
update_model_parameters(state)  # This updates the command-line flags

return output + [state]
@@ -21,7 +21,7 @@ def create_ui(default_preset):
shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button')

with gr.Column():
shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown')
shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown')

with gr.Row():
with gr.Column():

@@ -71,8 +71,6 @@ def create_ui(default_preset):
shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.')
shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.')
shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.')
shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.')

with gr.Column():
with gr.Row():
with gr.Column():

@@ -82,7 +80,7 @@ def create_ui(default_preset):
shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')
@@ -23,11 +23,15 @@ def create_ui():
shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table')

with gr.Column():
extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
if not shared.args.portable:
extension_status = gr.Markdown()
extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
extension_status = gr.Markdown()
else:
pass

shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light')
extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
if not shared.args.portable:
extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)

# Reset interface event
shared.gradio['reset_interface'].click(
@@ -72,6 +72,20 @@ def natural_keys(text):
return [atoi(c) for c in re.split(r'(\d+)', text)]


def check_model_loaded():
if shared.model_name == 'None' or shared.model is None:
if len(get_available_models()) <= 1:
error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it"
logger.error(error_msg)
return False, error_msg
else:
error_msg = "No model is loaded. Please select one in the Model tab."
logger.error(error_msg)
return False, error_msg

return True, None


def get_available_models():
# Get all GGUF files
gguf_files = get_available_ggufs()
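The new check_model_loaded() helper above returns an (ok, error_msg) tuple so call sites can surface a friendly message instead of raising. A hedged sketch of a possible call site, assuming the helper lives in modules.utils as the surrounding hunk suggests; the wrapper function here is illustrative, not the project's actual code.

```python
# Illustrative call-site pattern for the (ok, error_msg) tuple returned
# by check_model_loaded(); generate_or_explain() is a hypothetical wrapper.
from modules.utils import check_model_loaded


def generate_or_explain(prompt):
    ok, error_msg = check_model_loaded()
    if not ok:
        # Surface the error to the UI/API instead of raising an exception.
        yield error_msg
        return
    yield f"(model is loaded, would generate a reply for: {prompt!r})"
```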
@@ -126,7 +126,7 @@ def check_env():
sys.exit(1)

# Ensure this is a new environment and not the base environment
if os.environ["CONDA_DEFAULT_ENV"] == "base":
if os.environ.get("CONDA_DEFAULT_ENV", "") == "base":
print("Create an environment for this project and activate it. Exiting...")
sys.exit(1)

@@ -222,7 +222,7 @@ def update_pytorch_and_python():
if "+cu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
elif "+rocm" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4"
elif "+cpu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
elif "+cxx11" in torver:

@@ -273,7 +273,7 @@ def install_webui():
"What is your GPU?",
{
'A': 'NVIDIA - CUDA 12.4',
'B': 'AMD - Linux/macOS only, requires ROCm 6.1',
'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4',
'C': 'Apple M Series',
'D': 'Intel Arc (beta)',
'N': 'CPU mode'

@@ -314,7 +314,7 @@ def install_webui():
if selected_gpu == "NVIDIA":
install_pytorch += "--index-url https://download.pytorch.org/whl/cu124"
elif selected_gpu == "AMD":
install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1"
install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4"
elif selected_gpu in ["APPLE", "NONE"]:
install_pytorch += "--index-url https://download.pytorch.org/whl/cpu"
elif selected_gpu == "INTEL":
@@ -30,12 +30,12 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

@@ -29,6 +29,7 @@ sse-starlette==1.6.5
tiktoken

# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

@@ -29,6 +29,7 @@ sse-starlette==1.6.5
tiktoken

# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

@@ -29,7 +29,7 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

@@ -29,8 +29,8 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

@@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

@@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken

# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

@@ -30,12 +30,12 @@ sse-starlette==1.6.5
tiktoken

# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
@ -15,5 +15,5 @@ sse-starlette==1.6.5
|
||||||
tiktoken
|
tiktoken
|
||||||
|
|
||||||
# CUDA wheels
|
# CUDA wheels
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
||||||
|
|
|
@ -1,18 +0,0 @@
|
||||||
fastapi==0.112.4
|
|
||||||
gradio==4.37.*
|
|
||||||
jinja2==3.1.6
|
|
||||||
markdown
|
|
||||||
numpy==1.26.*
|
|
||||||
pydantic==2.8.2
|
|
||||||
pyyaml
|
|
||||||
requests
|
|
||||||
rich
|
|
||||||
tqdm
|
|
||||||
|
|
||||||
# API
|
|
||||||
flask_cloudflared==0.0.14
|
|
||||||
sse-starlette==1.6.5
|
|
||||||
tiktoken
|
|
||||||
|
|
||||||
# AMD wheels
|
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
|
|
@ -1,18 +0,0 @@
|
||||||
fastapi==0.112.4
|
|
||||||
gradio==4.37.*
|
|
||||||
jinja2==3.1.6
|
|
||||||
markdown
|
|
||||||
numpy==1.26.*
|
|
||||||
pydantic==2.8.2
|
|
||||||
pyyaml
|
|
||||||
requests
|
|
||||||
rich
|
|
||||||
tqdm
|
|
||||||
|
|
||||||
# API
|
|
||||||
flask_cloudflared==0.0.14
|
|
||||||
sse-starlette==1.6.5
|
|
||||||
tiktoken
|
|
||||||
|
|
||||||
# AMD wheels
|
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
|
|
|
@ -15,5 +15,5 @@ sse-starlette==1.6.5
|
||||||
tiktoken
|
tiktoken
|
||||||
|
|
||||||
# Mac wheels
|
# Mac wheels
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
|
||||||
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
|
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
|
||||||
|
|
|
@@ -15,6 +15,6 @@ sse-starlette==1.6.5
 tiktoken

 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken

 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken

 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
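Each of the wheel lines above pairs a direct download URL with a PEP 508 environment marker, so pip installs only the build that matches the current platform (and, for the macOS wheels, the Darwin kernel release that corresponds to a given macOS version). Below is a minimal, illustrative sketch of how such markers evaluate, using the third-party `packaging` library; the marker strings are copied from the lines above, everything else is an assumption for illustration.

```python
# Minimal illustration, assuming the third-party "packaging" library is installed.
from packaging.markers import Marker

# Marker strings copied from the wheel lines above.
markers = {
    "windows": 'platform_system == "Windows"',
    "linux_x86_64": 'platform_system == "Linux" and platform_machine == "x86_64"',
    # macOS 15 wheels are gated on the Darwin 24.x kernel release.
    "macos_15": 'platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"',
}

for name, expr in markers.items():
    # Marker.evaluate() checks the expression against the running interpreter;
    # pip skips any requirement whose marker evaluates to False.
    print(f"{name}: {Marker(expr).evaluate()}")
```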
server.py (20 changed lines)

@@ -51,6 +51,7 @@ from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
     get_fallback_settings,
     get_model_metadata,
+    update_gpu_layers_and_vram,
     update_model_parameters
 )
 from modules.shared import do_cmd_flags_warnings
@@ -90,7 +91,7 @@ def create_interface():
         'instruction_template_str': shared.settings['instruction_template_str'],
         'prompt_menu-default': shared.settings['prompt-default'],
         'prompt_menu-notebook': shared.settings['prompt-notebook'],
-        'filter_by_loader': shared.args.loader or 'All'
+        'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp'
     })

     if Path("user_data/cache/pfp_character.png").exists():
@@ -127,7 +128,8 @@ def create_interface():

         ui_parameters.create_ui(shared.settings['preset'])  # Parameters tab
         ui_model_menu.create_ui()  # Model tab
-        training.create_ui()  # Training tab
+        if not shared.args.portable:
+            training.create_ui()  # Training tab
         ui_session.create_ui()  # Session tab

         # Generation events
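This hunk and the previous one use `shared.args.portable` to narrow the UI for portable builds: the loader filter is pinned to `llama.cpp` and the Training tab is skipped. The sketch below shows the flag wiring this implies; the `--portable` argparse definition here is an assumption (the real one lives elsewhere in the project), and only its downstream use mirrors the diff.

```python
import argparse

# Assumed flag definition; only the usage below is taken from the diff above.
parser = argparse.ArgumentParser()
parser.add_argument('--portable', action='store_true',
                    help='Hypothetical: run with the reduced feature set of portable builds.')
parser.add_argument('--loader', type=str, default=None)
args = parser.parse_args(['--portable'])

# Portable builds ship only llama.cpp, so the loader filter is pinned and
# optional tabs (e.g. Training) are not built.
filter_by_loader = (args.loader or 'All') if not args.portable else 'llama.cpp'
build_training_tab = not args.portable

print(filter_by_loader, build_training_tab)  # -> llama.cpp False
```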
@@ -247,6 +249,20 @@ if __name__ == "__main__":
         model_settings = get_model_metadata(model_name)
         update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments

+        # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
+        if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
+            vram_usage, adjusted_layers = update_gpu_layers_and_vram(
+                shared.args.loader,
+                model_name,
+                model_settings['gpu_layers'],
+                shared.args.ctx_size,
+                shared.args.cache_type,
+                auto_adjust=True,
+                for_ui=False
+            )
+
+            shared.args.gpu_layers = adjusted_layers
+
         # Load the model
         shared.model, shared.tokenizer = load_model(model_name)
         if shared.args.lora:
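The new block only auto-adjusts `gpu_layers` when the user did not pass it explicitly, which requires knowing which options actually appeared on the command line. A rough sketch of that pattern with plain argparse follows; the `provided_arguments` set built here is a stand-in for the project's own bookkeeping, and the adjusted value is a placeholder.

```python
import argparse
import sys

parser = argparse.ArgumentParser()
parser.add_argument('--gpu-layers', type=int, default=0)
parser.add_argument('--ctx-size', type=int, default=8192)
args = parser.parse_args()

# argparse does not record which options were explicitly given, so derive that
# from argv: "--gpu-layers 10" and "--gpu-layers=10" both count as provided.
provided_arguments = {
    token.split('=', 1)[0].lstrip('-').replace('-', '_')
    for token in sys.argv[1:]
    if token.startswith('--')
}

if 'gpu_layers' not in provided_arguments:
    # The default was never overridden, so it is safe to auto-adjust it,
    # e.g. from model metadata and available VRAM (placeholder value here).
    args.gpu_layers = 33

print(args.gpu_layers)
```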
@@ -1,10 +1,15 @@
 #!/usr/bin/env bash

+# environment isolation
+export PYTHONNOUSERSITE=1
+unset PYTHONPATH
+unset PYTHONHOME
+
 cd "$(dirname "${BASH_SOURCE[0]}")"

 # Portable install case
 if [ -d "portable_env" ]; then
-    ./portable_env/bin/python3 server.py --api --auto-launch "$@"
+    ./portable_env/bin/python3 server.py --portable --api --auto-launch "$@"
     exit $?
 fi

@@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
     exit
 fi

-# environment isolation
-export PYTHONNOUSERSITE=1
-unset PYTHONPATH
-unset PYTHONHOME
 export CUDA_PATH="$INSTALL_ENV_DIR"
 export CUDA_HOME="$CUDA_PATH"

@@ -1,10 +1,15 @@
 #!/bin/bash

+# environment isolation
+export PYTHONNOUSERSITE=1
+unset PYTHONPATH
+unset PYTHONHOME
+
 cd "$(dirname "${BASH_SOURCE[0]}")"

 # Portable install case
 if [ -d "portable_env" ]; then
-    ./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@"
+    ./portable_env/bin/python3 server.py --portable --api --auto-launch --api-port 5005 "$@"
     exit $?
 fi

@@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then
     exit
 fi

-# environment isolation
-export PYTHONNOUSERSITE=1
-unset PYTHONPATH
-unset PYTHONHOME
 export CUDA_PATH="$INSTALL_ENV_DIR"
 export CUDA_HOME="$CUDA_PATH"

@@ -1,11 +1,16 @@
 @echo off
 setlocal enabledelayedexpansion

+@rem environment isolation
+set PYTHONNOUSERSITE=1
+set PYTHONPATH=
+set PYTHONHOME=
+
 cd /D "%~dp0"

 @rem Portable install case
 if exist "portable_env" (
-    .\portable_env\python.exe server.py --api --auto-launch %*
+    .\portable_env\python.exe server.py --portable --api --auto-launch %*
     exit /b %errorlevel%
 )

@@ -87,10 +92,6 @@ if not exist "%INSTALL_ENV_DIR%" (
 @rem check if conda environment was actually created
 if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end )

-@rem environment isolation
-set PYTHONNOUSERSITE=1
-set PYTHONPATH=
-set PYTHONHOME=
 set "CUDA_PATH=%INSTALL_ENV_DIR%"
 set "CUDA_HOME=%CUDA_PATH%"

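All three launchers now set `PYTHONNOUSERSITE=1` and clear `PYTHONPATH`/`PYTHONHOME` at the top of the script, before any Python starts, so packages from the system or user site cannot leak into the bundled environment. A small diagnostic sketch (not part of the project) that could be run inside the launched interpreter to confirm the isolation took effect:

```python
import os
import site
import sys

# Illustrative check only; prints OK/FAIL for each isolation guarantee.
checks = {
    "PYTHONNOUSERSITE set": os.environ.get("PYTHONNOUSERSITE") == "1",
    "PYTHONPATH cleared": not os.environ.get("PYTHONPATH"),
    "PYTHONHOME cleared": not os.environ.get("PYTHONHOME"),
    "user site-packages disabled": sys.flags.no_user_site == 1 or site.ENABLE_USER_SITE is False,
}

for name, ok in checks.items():
    print(f"{'OK  ' if ok else 'FAIL'} {name}")
```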
@@ -18,7 +18,6 @@ max_new_tokens_min: 1
 max_new_tokens_max: 4096
 prompt_lookup_num_tokens: 0
 max_tokens_second: 0
-max_updates_second: 12
 auto_max_new_tokens: true
 ban_eos_token: false
 add_bos_token: true
@@ -31,7 +30,6 @@ seed: -1
 custom_stopping_strings: ''
 custom_token_bans: ''
 negative_prompt: ''
-autoload_model: false
 dark_theme: true
 default_extensions: []
 instruction_template_str: |-