Merge pull request #6869 from oobabooga/dev

Merge dev branch
Authored by oobabooga on 2025-04-22 12:09:20 -03:00; committed by GitHub.
commit a778270536
48 changed files with 1292 additions and 705 deletions


@@ -5,8 +5,14 @@
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
- package-ecosystem: "pip"
directory: "/requirements/full/"
target-branch: "dev"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/requirements/portable/"
target-branch: "dev"
schedule:
interval: "weekly"


@@ -0,0 +1,49 @@
name: Build Everything TGW
on:
workflow_dispatch:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
permissions:
contents: write
jobs:
build_release_cuda_windows:
name: CUDA Windows
uses: ./.github/workflows/build-portable-release-cuda.yml
with:
version: ${{ inputs.version }}
config: 'os:windows-2019'
build_release_cuda_linux:
name: CUDA Linux
uses: ./.github/workflows/build-portable-release-cuda.yml
with:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'
build_release_cpu_windows:
name: CPU Windows
uses: ./.github/workflows/build-portable-release.yml
with:
version: ${{ inputs.version }}
config: 'os:windows-2019'
build_release_cpu_linux:
name: CPU Linux
uses: ./.github/workflows/build-portable-release.yml
with:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'
build_release_macos:
name: macOS
uses: ./.github/workflows/build-portable-release.yml
with:
version: ${{ inputs.version }}
config: 'os:macos-13,macos-14'


@@ -0,0 +1,183 @@
name: Build CUDA
on:
workflow_dispatch:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
workflow_call:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
permissions:
contents: write
jobs:
define_matrix:
name: Define Build Matrix
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
defaults:
run:
shell: pwsh
env:
CONFIGIN: ${{ inputs.config }}
EXCLUDEIN: ${{ inputs.exclude }}
steps:
- name: Define Job Output
id: set-matrix
run: |
$matrix = @{
'os' = @('ubuntu-22.04', 'windows-2019')
'pyver' = @("3.11")
'avx' = @("AVX2")
'cuda' = @("11.7", "12.4")
}
if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
if ($env:EXCLUDEIN -ne 'None') {
$exclusions = @()
$exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
$matrix['exclude'] = $exclusions
}
$matrixOut = ConvertTo-Json $matrix -Compress
Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
build_wheels:
name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }} CUDA ${{ matrix.cuda }}
needs: define_matrix
runs-on: ${{ matrix.os }}
strategy:
matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
defaults:
run:
shell: pwsh
env:
AVXVER: ${{ matrix.avx }}
PCKGVER: ${{ inputs.version }}
steps:
- uses: actions/checkout@v4
with:
repository: 'oobabooga/text-generation-webui'
ref: ${{ inputs.version }}
submodules: 'recursive'
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.pyver }}
- name: Build Package
shell: bash
run: |
rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
# Define common variables
CUDA_VERSION="${{ matrix.cuda }}"
AVX_SUPPORT="${{ matrix.avx }}"
VERSION="${{ inputs.version }}"
# 1. Set platform-specific variables
if [[ "$RUNNER_OS" == "Windows" ]]; then
PLATFORM="windows"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-pc-windows-msvc-install_only.tar.gz"
PIP_PATH="portable_env/python.exe -m pip"
PACKAGES_PATH="portable_env/Lib/site-packages"
ZIP_CMD="powershell -Command \"Compress-Archive -Path text-generation-webui -DestinationPath"
rm start_linux.sh start_macos.sh
else
PLATFORM="linux"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
PIP_PATH="portable_env/bin/python -m pip"
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
ZIP_CMD="zip -r"
rm start_macos.sh start_windows.bat
fi
# 2. Download and extract Python
cd ..
echo "Downloading Python for $PLATFORM..."
curl -L -o python-build.tar.gz "$PYTHON_URL"
tar -xzf python-build.tar.gz
mv python text-generation-webui/portable_env
# 3. Prepare requirements file based on AVX and CUDA
if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
BASE_REQ_FILE="requirements/portable/requirements.txt"
else
BASE_REQ_FILE="requirements/portable/requirements_noavx2.txt"
fi
# Create CUDA-specific requirements file if needed
cd text-generation-webui
if [[ "$CUDA_VERSION" == "11.7" ]]; then
echo "Creating CUDA 11.7 specific requirements file"
sed 's/cu124/cu117/g' "$BASE_REQ_FILE" > requirements_cuda_temp.txt
REQ_FILE="requirements_cuda_temp.txt"
else
REQ_FILE="$BASE_REQ_FILE"
fi
# 4. Install packages
echo "Installing Python packages from $REQ_FILE..."
$PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
# 5. Clean up
if [[ "$CUDA_VERSION" == "11.7" ]]; then
rm requirements_cuda_temp.txt
fi
# 6. Create ZIP file
cd ..
VERSION_CLEAN="${VERSION#v}"
ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
echo "Creating archive: $ZIP_NAME"
if [[ "$RUNNER_OS" == "Windows" ]]; then
powershell -Command "Compress-Archive -Path text-generation-webui -DestinationPath $ZIP_NAME"
else
zip -r "$ZIP_NAME" text-generation-webui
fi
- name: Upload files to a GitHub release
id: upload-release
uses: svenstaro/upload-release-action@2.7.0
continue-on-error: true
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: ../textgen-portable-${{ inputs.version }}*.zip
tag: ${{ inputs.version }}
file_glob: true
make_latest: false
overwrite: true
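For reference, the `config` and `exclude` inputs above use a compact `key1:item1-1,item1-2;key2:item2-1` syntax that the PowerShell step expands into a JSON build matrix. Below is a rough Python re-implementation of that parsing, for illustration only; `build_matrix` and its defaults are not part of the repository.

```python
# Hypothetical re-implementation of the matrix parsing done by the pwsh step above.
# "Default"/"None" keep the hard-coded axes and skip exclusions, matching the inputs.
import json


def build_matrix(config: str = "Default", exclude: str = "None") -> str:
    matrix = {
        "os": ["ubuntu-22.04", "windows-2019"],
        "pyver": ["3.11"],
        "avx": ["AVX2"],
        "cuda": ["11.7", "12.4"],
    }
    if config != "Default":
        # "os:windows-2019;cuda:12.4" overrides the listed axes
        for part in config.split(";"):
            key, _, items = part.partition(":")
            matrix[key] = items.split(",")

    if exclude != "None":
        # "os:windows-2019,cuda:11.7;os:macos-14,avx:AVX2" -> list of exclusion dicts
        matrix["exclude"] = [
            dict(item.split(":", 1) for item in group.split(","))
            for group in exclude.split(";")
        ]

    return json.dumps(matrix, separators=(",", ":"))


print(build_matrix(config="os:windows-2019"))
```

Calling `build_matrix(config='os:windows-2019')` would, under these assumptions, restrict the matrix to a single Windows runner while keeping the default Python, AVX, and CUDA axes.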


@@ -0,0 +1,193 @@
name: Build CPU and macOS
on:
workflow_dispatch:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
workflow_call:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
permissions:
contents: write
jobs:
define_matrix:
name: Define Build Matrix
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
defaults:
run:
shell: pwsh
env:
CONFIGIN: ${{ inputs.config }}
EXCLUDEIN: ${{ inputs.exclude }}
steps:
- name: Define Job Output
id: set-matrix
run: |
$matrix = @{
'os' = @('ubuntu-22.04', 'windows-2019', 'macos-13', 'macos-14')
'pyver' = @("3.11")
'avx' = @("AVX2")
}
if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
if ($env:EXCLUDEIN -ne 'None') {
$exclusions = @()
$exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
$matrix['exclude'] = $exclusions
}
$matrixOut = ConvertTo-Json $matrix -Compress
Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
build_wheels:
name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }}
needs: define_matrix
runs-on: ${{ matrix.os }}
strategy:
matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
defaults:
run:
shell: pwsh
env:
AVXVER: ${{ matrix.avx }}
PCKGVER: ${{ inputs.version }}
steps:
- uses: actions/checkout@v4
with:
repository: 'oobabooga/text-generation-webui'
ref: ${{ inputs.version }}
submodules: 'recursive'
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.pyver }}
- name: Build Package
shell: bash
run: |
rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"
VERSION="${{ inputs.version }}"
OS_TYPE="${{ matrix.os }}"
# 1. Set platform-specific variables
if [[ "$RUNNER_OS" == "Windows" ]]; then
PLATFORM="windows-cpu"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-pc-windows-msvc-install_only.tar.gz"
PIP_PATH="portable_env/python.exe -m pip"
PACKAGES_PATH="portable_env/Lib/site-packages"
rm start_linux.sh start_macos.sh
elif [[ "$RUNNER_OS" == "macOS" ]]; then
if [[ "$OS_TYPE" == "macos-13" ]]; then
PLATFORM="macos-x86_64"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-apple-darwin-install_only.tar.gz"
REQ_TYPE="apple_intel"
else
PLATFORM="macos-arm64"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-aarch64-apple-darwin-install_only.tar.gz"
REQ_TYPE="apple_silicon"
fi
PIP_PATH="portable_env/bin/python -m pip"
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
rm start_linux.sh start_windows.bat
else
# Linux case
PLATFORM="linux-cpu"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
PIP_PATH="portable_env/bin/python -m pip"
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
rm start_macos.sh start_windows.bat
fi
# 2. Download and extract Python
echo "Downloading Python for $PLATFORM..."
cd ..
curl -L -o python-build.tar.gz "$PYTHON_URL"
tar -xzf python-build.tar.gz
mv python text-generation-webui/portable_env
# 3. Prepare requirements file based on platform and AVX
cd text-generation-webui
# Select requirements file based on platform
if [[ "$RUNNER_OS" == "macOS" ]]; then
if [[ "$OS_TYPE" == "macos-13" ]]; then
REQ_FILE="requirements/portable/requirements_apple_intel.txt"
else
REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
fi
else
# For Windows and Linux, check AVX support
if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
REQ_FILE="requirements/portable/requirements_cpu_only.txt"
else
REQ_FILE="requirements/portable/requirements_cpu_only_noavx2.txt"
fi
fi
echo "Using requirements file: $REQ_FILE"
# 4. Install packages
echo "Installing Python packages from $REQ_FILE..."
$PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
# 5. Create ZIP file
cd ..
VERSION_CLEAN="${VERSION#v}"
ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip"
echo "Creating archive: $ZIP_NAME"
if [[ "$RUNNER_OS" == "Windows" ]]; then
powershell -Command "Compress-Archive -Path text-generation-webui -DestinationPath $ZIP_NAME"
else
zip -r "$ZIP_NAME" text-generation-webui
fi
- name: Upload files to a GitHub release
id: upload-release
uses: svenstaro/upload-release-action@2.7.0
continue-on-error: true
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: ../textgen-portable-${{ inputs.version }}*.zip
tag: ${{ inputs.version }}
file_glob: true
make_latest: false
overwrite: true


@@ -27,6 +27,14 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## How to install
#### Option 1: Portable builds
Compatible with GGUF (llama.cpp) models, just unzip and run, no installation. Available for Windows, Linux, and macOS.
Download from: https://github.com/oobabooga/text-generation-webui/releases
#### Option 2: One-click installer
1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip).
2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`.
3) Select your GPU vendor when asked.
@@ -352,6 +360,10 @@ Run `python download-model.py --help` to see all the options.
https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/main/Colab-TextGen-GPU.ipynb
## Community
https://www.reddit.com/r/Oobabooga/
## Acknowledgment
In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.


@@ -7,10 +7,7 @@ from io import BytesIO
import requests
import tiktoken
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import LogitsProcessor, LogitsProcessorList
from extensions.openai.errors import InvalidRequestError
from extensions.openai.utils import debug_msg
@@ -22,54 +19,7 @@ from modules.chat import (
load_instruction_template_memoized
)
from modules.presets import load_preset_memoized
from modules.text_generation import (
decode,
encode,
generate_reply,
get_reply_from_output_ids
)
from modules.text_generation import decode, encode, generate_reply
class LogitsBiasProcessor(LogitsProcessor):
def __init__(self, logit_bias={}):
self.logit_bias = logit_bias
if self.logit_bias:
self.keys = list([int(key) for key in self.logit_bias.keys()])
values = [self.logit_bias[str(key)] for key in self.keys]
self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device)
debug_msg(f"{self})")
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logit_bias:
debug_msg(logits[0, self.keys], " + ", self.values)
logits[0, self.keys] += self.values
debug_msg(" --> ", logits[0, self.keys])
debug_msg(" max/min ", float(torch.max(logits[0])), float(torch.min(logits[0])))
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>"
class LogprobProcessor(LogitsProcessor):
def __init__(self, logprobs=None):
self.logprobs = logprobs
self.token_alternatives = {}
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logprobs is not None: # 0-5
log_e_probabilities = F.log_softmax(logits, dim=1)
top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1)
top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]]
top_probs = [float(x) for x in top_values[0]]
self.token_alternatives = dict(zip(top_tokens, top_probs))
debug_msg(repr(self))
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>"
def convert_logprobs_to_tiktoken(model, logprobs):
@@ -107,21 +57,29 @@ def process_parameters(body, is_legacy=False):
elif isinstance(body['stop'], list):
generate_params['custom_stopping_strings'] = body['stop']
logits_processor = []
logit_bias = body.get('logit_bias', None)
if logit_bias: # {str: float, ...}
logits_processor = [LogitsBiasProcessor(logit_bias)]
logprobs = None # coming to chat eventually
if 'logprobs' in body:
logprobs = body.get('logprobs', 0) # maybe cap at topk? don't clamp 0-5.
generate_params['logprob_proc'] = LogprobProcessor(logprobs)
logits_processor.extend([generate_params['logprob_proc']])
else:
logprobs = None
if logits_processor: # requires logits_processor support
generate_params['logits_processor'] = LogitsProcessorList(logits_processor)
if shared.args.loader != 'llama.cpp':
from transformers import LogitsProcessorList
from modules.transformers_loader import (
LogitsBiasProcessor,
LogprobProcessor
)
logits_processor = []
logit_bias = body.get('logit_bias', None)
if logit_bias: # {str: float, ...}
logits_processor = [LogitsBiasProcessor(logit_bias)]
logprobs = None # coming to chat eventually
if 'logprobs' in body:
logprobs = body.get('logprobs', 0) # maybe cap at topk? don't clamp 0-5.
generate_params['logprob_proc'] = LogprobProcessor(logprobs)
logits_processor.extend([generate_params['logprob_proc']])
else:
logprobs = None
if logits_processor: # requires logits_processor support
generate_params['logits_processor'] = LogitsProcessorList(logits_processor)
return generate_params
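The hunk above defers the transformers import and pulls `LogitsBiasProcessor`/`LogprobProcessor` from the new `modules.transformers_loader` module, so llama.cpp-only installs never import torch. As a minimal, self-contained sketch of what a logit-bias step does (shapes and names are illustrative, not the project's actual class):

```python
# Minimal sketch of the logit-bias idea: add a per-token-id offset to the
# last-step logits. Shapes and the helper name are assumptions for the example.
import torch


def apply_logit_bias(logits: torch.Tensor, logit_bias: dict) -> torch.Tensor:
    """logits: [batch, vocab]; logit_bias: {"token_id": bias} as in the OpenAI API."""
    keys = [int(k) for k in logit_bias]
    values = torch.tensor([logit_bias[str(k)] for k in keys], dtype=logits.dtype)
    logits[0, keys] += values  # bias only the first sequence in the batch
    return logits


biased = apply_logit_bias(torch.zeros(1, 8), {"3": 5.0, "5": -100.0})
print(biased)
```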


@@ -6,7 +6,6 @@ import traceback
from collections import deque
from threading import Thread
import speech_recognition as sr
import uvicorn
from fastapi import Depends, FastAPI, Header, HTTPException
from fastapi.middleware.cors import CORSMiddleware
@@ -16,11 +15,9 @@ from pydub import AudioSegment
from sse_starlette import EventSourceResponse
import extensions.openai.completions as OAIcompletions
import extensions.openai.embeddings as OAIembeddings
import extensions.openai.images as OAIimages
import extensions.openai.logits as OAIlogits
import extensions.openai.models as OAImodels
import extensions.openai.moderations as OAImoderations
from extensions.openai.errors import ServiceUnavailableError
from extensions.openai.tokens import token_count, token_decode, token_encode
from extensions.openai.utils import _start_cloudflared
@@ -165,6 +162,8 @@ def handle_billing_usage():
@app.post('/v1/audio/transcriptions', dependencies=check_key)
async def handle_audio_transcription(request: Request):
import speech_recognition as sr
r = sr.Recognizer()
form = await request.form()
@@ -211,6 +210,8 @@ async def handle_image_generation(request: Request):
@app.post("/v1/embeddings", response_model=EmbeddingsResponse, dependencies=check_key)
async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
import extensions.openai.embeddings as OAIembeddings
input = request_data.input
if not input:
raise HTTPException(status_code=400, detail="Missing required argument input")
@@ -224,6 +225,8 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
@app.post("/v1/moderations", dependencies=check_key)
async def handle_moderations(request: Request):
import extensions.openai.moderations as OAImoderations
body = await request.json()
input = body["input"]
if not input:
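The pattern in this file is to move optional dependencies (`speech_recognition`, the embeddings and moderations modules) into the request handlers that need them, so the API server can start even when those extras are not installed. A hedged, generic sketch of the same idea with FastAPI (the endpoint and response shown are invented for the example):

```python
# Illustrative pattern only: import an optional dependency when the endpoint
# that needs it is called, so startup does not fail if the package is missing.
from fastapi import FastAPI, HTTPException

app = FastAPI()


@app.post("/v1/audio/transcriptions")
async def transcribe():
    try:
        import speech_recognition as sr  # deferred, optional dependency
    except ImportError:
        raise HTTPException(status_code=500, detail="speech_recognition is not installed")

    return {"recognizer": type(sr.Recognizer()).__name__}
```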


@@ -2,7 +2,6 @@ from pathlib import Path
import modules.shared as shared
from modules.logging_colors import logger
from modules.models import get_device
def add_lora_to_model(lora_names):
@@ -47,9 +46,10 @@ def add_lora_exllamav2(lora_names):
def add_lora_transformers(lora_names):
from peft import PeftModel
from modules.torch_utils import get_device
prior_set = set(shared.lora_names)
added_set = set(lora_names) - prior_set
removed_set = prior_set - set(lora_names)


@@ -2,9 +2,6 @@ import traceback
from queue import Queue
from threading import Thread
import torch
import transformers
import modules.shared as shared
@@ -12,25 +9,6 @@ class StopNowException(Exception):
pass
class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
def __init__(self):
transformers.StoppingCriteria.__init__(self)
def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
return shared.stop_everything
class Stream(transformers.StoppingCriteria):
def __init__(self, callback_func=None):
self.callback_func = callback_func
def __call__(self, input_ids, scores) -> bool:
if self.callback_func is not None:
self.callback_func(input_ids[0])
return False
class Iteratorize:
"""


@@ -2,13 +2,11 @@ import datetime
from pathlib import Path
import pandas as pd
import torch
from datasets import load_dataset
from tqdm import tqdm
from modules import shared
from modules.logging_colors import logger
from modules.models import clear_torch_cache, load_model, unload_model
from modules.models import load_model, unload_model
from modules.models_settings import get_model_metadata, update_model_parameters
from modules.text_generation import encode
@@ -39,6 +37,11 @@ def calculate_perplexity(models, input_dataset, stride, _max_length):
https://huggingface.co/docs/transformers/perplexity#calculating-ppl-with-fixedlength-models
'''
import torch
from datasets import load_dataset
from modules.torch_utils import clear_torch_cache
if shared.args.loader == "llama.cpp":
logger.error("Perplexity evaluation is not implemented for the llama.cpp loader.")
raise ValueError


@@ -4,10 +4,6 @@ from pathlib import Path
from typing import Any, Dict, Optional, Union
import torch
from torch.nn import CrossEntropyLoss
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Cache,
@@ -18,6 +14,15 @@ from exllamav2 import (
ExLlamaV2Cache_TP,
ExLlamaV2Config
)
from torch.nn import CrossEntropyLoss
from transformers import (
GenerationConfig,
GenerationMixin,
PretrainedConfig,
PreTrainedModel
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from modules import shared
from modules.logging_colors import logger
@@ -28,7 +33,7 @@ except Exception:
traceback.print_exc()
class Exllamav2HF(PreTrainedModel):
class Exllamav2HF(PreTrainedModel, GenerationMixin):
def __init__(self, config: ExLlamaV2Config):
super().__init__(PretrainedConfig())
self.ex_config = config


@@ -6,7 +6,12 @@ from typing import Any, Dict, Optional, Union
import torch
from exllamav3 import Cache, Config, Model
from torch.nn import CrossEntropyLoss
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
from transformers import (
GenerationConfig,
GenerationMixin,
PretrainedConfig,
PreTrainedModel
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from modules import shared
@@ -19,7 +24,7 @@ except Exception:
traceback.print_exc()
class Exllamav3HF(PreTrainedModel):
class Exllamav3HF(PreTrainedModel, GenerationMixin):
def __init__(self, model_dir):
super().__init__(PretrainedConfig())
self.generation_config = GenerationConfig()
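Both ExLlama wrappers now inherit from `GenerationMixin` in addition to `PreTrainedModel`, presumably because recent transformers releases no longer provide `generate()` through `PreTrainedModel` alone. A structural sketch of the pattern follows; the `TinyWrapper` class and its dummy `forward` are invented for illustration and are not the project's code.

```python
# Structural sketch: a custom wrapper keeps generation support by mixing
# GenerationMixin in explicitly. A real wrapper would also implement
# prepare_inputs_for_generation and call its inference backend in forward().
import torch
from transformers import GenerationConfig, GenerationMixin, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast


class TinyWrapper(PreTrainedModel, GenerationMixin):
    def __init__(self):
        super().__init__(PretrainedConfig())
        self.generation_config = GenerationConfig()

    def forward(self, input_ids=None, **kwargs):
        # Placeholder logits instead of a real backend call.
        vocab_size = 32
        logits = torch.zeros(input_ids.shape[0], input_ids.shape[1], vocab_size)
        return CausalLMOutputWithPast(logits=logits)
```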


@@ -1,4 +1,5 @@
import json
import os
import pprint
import socket
import subprocess
@@ -281,12 +282,21 @@ class LlamaServer:
if shared.args.rope_freq_base > 0:
cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)]
env = os.environ.copy()
if os.name == 'posix':
current_path = env.get('LD_LIBRARY_PATH', '')
if current_path:
env['LD_LIBRARY_PATH'] = f"{current_path}:{os.path.dirname(self.server_path)}"
else:
env['LD_LIBRARY_PATH'] = os.path.dirname(self.server_path)
# Start the server with pipes for output
self.process = subprocess.Popen(
cmd,
stderr=subprocess.PIPE,
text=True,
bufsize=1
bufsize=1,
env=env
)
def filter_stderr(process_stderr):
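The new `env` block appends the server binary's directory to `LD_LIBRARY_PATH` on POSIX systems so the bundled `llama-server` can find shared libraries shipped next to it. A standalone sketch of the same environment handling; the binary path below is made up for the example:

```python
# Extend LD_LIBRARY_PATH with the directory containing a bundled binary
# before launching it, so it resolves .so files sitting next to it.
import os
import subprocess

server_path = "./portable_env/bin/llama-server"  # hypothetical path

env = os.environ.copy()
if os.name == "posix":
    lib_dir = os.path.dirname(os.path.abspath(server_path))
    current = env.get("LD_LIBRARY_PATH", "")
    env["LD_LIBRARY_PATH"] = f"{current}:{lib_dir}" if current else lib_dir

# subprocess.Popen([server_path, "--port", "8080"], env=env)  # launch with the patched env
print(env.get("LD_LIBRARY_PATH"))
```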


@@ -3,29 +3,7 @@ from collections import OrderedDict
import gradio as gr
from modules import shared
loaders_and_params = OrderedDict({
'Transformers': [
'gpu_memory',
'cpu_memory',
'alpha_value',
'compress_pos_emb',
'compute_dtype',
'quant_type',
'load_in_8bit',
'load_in_4bit',
'torch_compile',
'use_flash_attention_2',
'auto_devices',
'cpu',
'disk',
'use_double_quant',
'use_eager_attention',
'bf16',
'trust_remote_code',
'no_use_fast',
],
'llama.cpp': [
'n_gpu_layers',
'threads',
@@ -43,6 +21,25 @@ loaders_and_params = OrderedDict({
'mlock',
'numa',
],
'Transformers': [
'gpu_split',
'cpu_memory',
'alpha_value',
'compress_pos_emb',
'compute_dtype',
'quant_type',
'load_in_8bit',
'load_in_4bit',
'torch_compile',
'use_flash_attention_2',
'cpu',
'disk',
'use_double_quant',
'use_eager_attention',
'bf16',
'trust_remote_code',
'no_use_fast',
],
'ExLlamav3_HF': [
'max_seq_len',
'gpu_split',
@@ -346,10 +343,6 @@ def blacklist_samplers(loader, dynamic_temperature):
return output
def get_gpu_memory_keys():
return [k for k in shared.gradio if k.startswith('gpu_memory')]
@functools.cache
def get_all_params():
all_params = set()
@@ -357,11 +350,6 @@ def get_all_params():
for el in loaders_and_params[k]:
all_params.add(el)
if 'gpu_memory' in all_params:
all_params.remove('gpu_memory')
for k in get_gpu_memory_keys():
all_params.add(k)
return sorted(all_params)
@@ -371,8 +359,4 @@ def make_loader_params_visible(loader):
if loader in loaders_and_params:
params = loaders_and_params[loader]
if 'gpu_memory' in params:
params.remove('gpu_memory')
params += get_gpu_memory_keys()
return [gr.update(visible=True) if k in params else gr.update(visible=False) for k in all_params]
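With the `gpu_memory` special-casing removed, `make_loader_params_visible` reduces to a plain dictionary lookup: every known parameter gets a visibility update, and only the selected loader's parameters are shown. A simplified, Gradio-free sketch of that logic, using a truncated parameter list and a plain bool in place of `gr.update(...)`:

```python
# Simplified visibility logic: the selected loader's parameters are visible,
# everything else is hidden. Parameter lists are shortened for the example.
from collections import OrderedDict

loaders_and_params = OrderedDict({
    "llama.cpp": ["n_gpu_layers", "threads", "mlock"],
    "Transformers": ["cpu_memory", "load_in_4bit", "bf16"],
})

all_params = sorted({p for params in loaders_and_params.values() for p in params})


def visibility(loader: str) -> dict:
    params = loaders_and_params.get(loader, [])
    return {p: p in params for p in all_params}


print(visibility("llama.cpp"))
```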


@@ -2,11 +2,10 @@ import time
import traceback
import numpy as np
import torch
from modules import models, sampler_hijack, shared
from modules import models, shared
from modules.logging_colors import logger
from modules.models import get_device, load_model
from modules.models import load_model
from modules.text_generation import generate_reply
global_scores = None
@@ -38,18 +37,16 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
logger.error("No model is loaded! Select one in the Model tab.")
return 'Error: No model is loaded1 Select one in the Model tab.', previous
is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
is_llamacpp = shared.model.__class__.__name__ == 'LlamaServer'
if is_llamacpp:
# llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer':
logprobs = shared.model.get_logits(prompt, state, n_probs=top_logits, use_samplers=use_samplers)
if return_dict:
output = {}
for entry in logprobs:
token = repr(entry['token'])
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
output[token] = prob
return output
else:
output = ''
@@ -57,9 +54,17 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
token = repr(entry['token'])
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
output += f"{prob:.5f} - {token}\n"
return output, previous
# All other model types
else:
import torch
from modules import sampler_hijack
from modules.torch_utils import get_device
is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
if not use_samplers:
state = {'stream': True}
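In the llama.cpp branch above, the server returns raw logprobs and the UI exponentiates them for display when samplers are not applied. A tiny illustration with made-up sample data:

```python
# Convert raw logprobs to probabilities for display, as in the branch above.
import numpy as np

logprobs = [{"token": "Hello", "logprob": -0.105}, {"token": "Hi", "logprob": -2.4}]
for entry in logprobs:
    print(f"{np.exp(entry['logprob']):.5f} - {entry['token']!r}")
```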


@@ -1,61 +1,11 @@
import gc
import sys
import os
import pprint
import re
import time
from pathlib import Path
import torch
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.utils import (
is_ccl_available,
is_npu_available,
is_xpu_available
)
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
BitsAndBytesConfig,
is_torch_npu_available,
is_torch_xpu_available
)
import modules.shared as shared
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata
transformers.logging.set_verbosity_error()
local_rank = None
if shared.args.deepspeed:
import deepspeed
from transformers.integrations.deepspeed import (
HfDeepSpeedConfig,
is_deepspeed_zero3_enabled
)
from modules.deepspeed_parameters import generate_ds_config
# Distributed setup
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
if is_xpu_available() and is_ccl_available():
torch.xpu.set_device(local_rank)
deepspeed.init_distributed(backend="ccl")
elif is_npu_available():
torch.npu.set_device(local_rank)
deepspeed.init_distributed(dist_backend="hccl")
else:
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
last_generation_time = time.time()
@@ -66,8 +16,8 @@ def load_model(model_name, loader=None):
shared.is_seq2seq = False
shared.model_name = model_name
load_func_map = {
'Transformers': huggingface_loader,
'llama.cpp': llama_cpp_server_loader,
'Transformers': transformers_loader,
'ExLlamav3_HF': ExLlamav3_HF_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ExLlamav2': ExLlamav2_loader,
@@ -85,8 +35,11 @@ def load_model(model_name, loader=None):
logger.error('The path to the model does not exist. Exiting.')
raise ValueError
if loader != 'llama.cpp' and 'sampler_hijack' not in sys.modules:
from modules import sampler_hijack
sampler_hijack.hijack_samplers()
shared.args.loader = loader
clear_torch_cache()
output = load_func_map[loader](model_name)
if type(output) is tuple:
model, tokenizer = output
@@ -95,6 +48,7 @@ def load_model(model_name, loader=None):
if model is None:
return None, None
else:
from modules.transformers_loader import load_tokenizer
tokenizer = load_tokenizer(model_name)
shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
@@ -110,163 +64,6 @@
return model, tokenizer
def load_tokenizer(model_name, tokenizer_dir=None):
if tokenizer_dir:
path_to_model = Path(tokenizer_dir)
else:
path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
tokenizer = None
if path_to_model.exists():
if shared.args.no_use_fast:
logger.info('Loading the tokenizer with use_fast=False.')
tokenizer = AutoTokenizer.from_pretrained(
path_to_model,
trust_remote_code=shared.args.trust_remote_code,
use_fast=not shared.args.no_use_fast
)
return tokenizer
def huggingface_loader(model_name):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
params = {
'low_cpu_mem_usage': True,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
}
if shared.args.trust_remote_code:
params['trust_remote_code'] = True
if shared.args.use_flash_attention_2:
params['use_flash_attention_2'] = True
if shared.args.force_safetensors:
params['force_safetensors'] = True
if shared.args.use_eager_attention:
params['attn_implementation'] = 'eager'
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
if 'chatglm' in model_name.lower():
LoaderClass = AutoModel
else:
if config.to_dict().get('is_encoder_decoder', False):
LoaderClass = AutoModelForSeq2SeqLM
shared.is_seq2seq = True
else:
LoaderClass = AutoModelForCausalLM
# Determine if we should use default loading
should_use_default_loading = not any([
shared.args.cpu,
shared.args.load_in_8bit,
shared.args.load_in_4bit,
shared.args.auto_devices,
shared.args.disk,
shared.args.deepspeed,
shared.args.gpu_memory is not None,
shared.args.cpu_memory is not None,
shared.args.compress_pos_emb > 1,
shared.args.alpha_value > 1,
])
# Load the model without any special settings
if should_use_default_loading:
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit):
device = get_device()
if device:
model = model.to(device)
# DeepSpeed ZeRO-3
elif shared.args.deepspeed:
model = LoaderClass.from_pretrained(
path_to_model,
torch_dtype=params['torch_dtype'],
trust_remote_code=params.get('trust_remote_code')
)
model = deepspeed.initialize(
model=model,
config_params=ds_config,
model_parameters=None,
optimizer=None,
lr_scheduler=None
)[0]
model.module.eval() # Inference
logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
# Load with quantization and/or offloading
else:
if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
shared.args.cpu = True
if shared.args.cpu:
params['torch_dtype'] = torch.float32
else:
params['device_map'] = 'auto'
if x := get_max_memory_dict():
params['max_memory'] = x
if shared.args.load_in_4bit:
# See https://github.com/huggingface/transformers/pull/23479/files
# and https://huggingface.co/blog/4bit-transformers-bitsandbytes
quantization_config_params = {
'load_in_4bit': True,
'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
'bnb_4bit_quant_type': shared.args.quant_type,
'bnb_4bit_use_double_quant': shared.args.use_double_quant,
'llm_int8_enable_fp32_cpu_offload': True
}
params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
elif shared.args.load_in_8bit:
if shared.args.auto_devices or shared.args.gpu_memory:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
else:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
if params.get('max_memory') is not None:
with init_empty_weights():
model = LoaderClass.from_config(config, trust_remote_code=params.get('trust_remote_code'))
model.tie_weights()
params['device_map'] = infer_auto_device_map(
model,
dtype=torch.int8,
max_memory=params.get('max_memory'),
no_split_module_classes=model._no_split_modules
)
if shared.args.disk:
params['offload_folder'] = shared.args.disk_cache_dir
if shared.args.compress_pos_emb > 1:
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
elif shared.args.alpha_value > 1:
params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if shared.args.torch_compile:
model = torch.compile(model)
return model
def llama_cpp_server_loader(model_name):
from modules.llama_cpp_server import LlamaServer
@@ -284,6 +81,11 @@ def llama_cpp_server_loader(model_name):
logger.error(f"Error loading the model with llama.cpp: {str(e)}")
def transformers_loader(model_name):
from modules.transformers_loader import load_model_HF
return load_model_HF(model_name)
def ExLlamav3_HF_loader(model_name):
from modules.exllamav3_hf import Exllamav3HF
@@ -328,71 +130,18 @@ def TensorRT_LLM_loader(model_name):
return model
def get_max_memory_dict():
max_memory = {}
max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
if shared.args.gpu_memory:
memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
for i in range(len(memory_map)):
max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
# If --auto-devices is provided standalone, try to get a reasonable value
# for the maximum memory of device :0
elif shared.args.auto_devices:
if is_xpu_available():
total_mem = (torch.xpu.get_device_properties(0).total_memory / (1024 * 1024))
else:
total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
suggestion = round((total_mem - 1000) / 1000) * 1000
if total_mem - suggestion < 800:
suggestion -= 1000
suggestion = int(round(suggestion / 1000))
logger.warning(f"Auto-assiging --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors. You can manually set other values.")
max_memory[0] = f'{suggestion}GiB'
max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
return max_memory if len(max_memory) > 0 else None
def get_device():
if torch.cuda.is_available():
return torch.device('cuda')
elif shared.args.deepspeed:
import deepspeed
return deepspeed.get_accelerator().current_device_name()
elif torch.backends.mps.is_available():
return torch.device('mps')
elif is_torch_xpu_available():
return torch.device('xpu:0')
elif is_torch_npu_available():
return torch.device('npu:0')
else:
return None
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
if torch.cuda.is_available():
torch.cuda.empty_cache()
elif is_xpu_available():
torch.xpu.empty_cache()
elif is_npu_available():
torch.npu.empty_cache()
elif torch.backends.mps.is_available():
if hasattr(torch.backends.mps, 'empty_cache'):
torch.backends.mps.empty_cache()
def unload_model(keep_model_name=False):
if shared.model is None:
return
is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
shared.model = shared.tokenizer = None
shared.lora_names = []
shared.model_dirty_from_training = False
clear_torch_cache()
if not is_llamacpp:
from modules.torch_utils import clear_torch_cache
clear_torch_cache()
if not keep_model_name:
shared.model_name = 'None'
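`load_model` now dispatches through small wrapper functions that import their backends lazily (for example, `transformers_loader` only imports `modules.transformers_loader` when that loader is selected), which is what keeps torch and transformers out of the portable llama.cpp builds. An illustrative sketch of the dispatch pattern; the function and model names below are placeholders, not project code:

```python
# Lazy loader dispatch: each entry imports its heavy backend only when used.
def _llama_cpp_loader(model_name):
    print(f"starting llama-server for {model_name} (no torch import needed)")
    return object()


def _transformers_loader(model_name):
    import torch  # deferred heavy import
    print(f"loading {model_name} with torch {torch.__version__}")
    return object()


LOAD_FUNC_MAP = {
    "llama.cpp": _llama_cpp_loader,
    "Transformers": _transformers_loader,
}


def load_model(model_name, loader="llama.cpp"):
    return LOAD_FUNC_MAP[loader](model_name)


load_model("my-model.gguf", loader="llama.cpp")
```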


@@ -188,41 +188,20 @@ def update_model_parameters(state, initial=False):
UI: update the command-line arguments based on the interface values
'''
elements = ui.list_model_elements() # the names of the parameters
gpu_memories = []
for i, element in enumerate(elements):
if element not in state:
continue
value = state[element]
if element.startswith('gpu_memory'):
gpu_memories.append(value)
continue
if initial and element in shared.provided_arguments:
continue
if element in ['cpu_memory'] and value == 0:
if element == 'cpu_memory' and value == 0:
value = vars(shared.args_defaults)[element]
# Making some simple conversions
if element == 'cpu_memory' and value is not None:
value = f"{value}MiB"
setattr(shared.args, element, value)
found_positive = False
for i in gpu_memories:
if i > 0:
found_positive = True
break
if not (initial and vars(shared.args)['gpu_memory'] != vars(shared.args_defaults)['gpu_memory']):
if found_positive:
shared.args.gpu_memory = [f"{i}MiB" for i in gpu_memories]
else:
shared.args.gpu_memory = None
def apply_model_settings_to_state(model, state):
'''


@@ -13,7 +13,10 @@ from transformers.generation.logits_process import (
from modules import shared
from modules.logging_colors import logger
from modules.models import get_device
from modules.torch_utils import get_device
original_init = transformers.GenerationConfig.__init__
original_get_logits_processor = transformers.GenerationMixin._get_logits_processor
global_scores = None
@@ -484,7 +487,7 @@ def get_logits_processor_patch(self, **kwargs):
generation_config.temperature = float(generation_config.temperature) # Must be float
# Get the original warpers
warpers = self._get_logits_processor_old(**kwargs)
warpers = original_get_logits_processor(self, **kwargs)
for i in range(len(warpers) - 1, -1, -1):
# Replace temperature with our modified class.
@@ -674,7 +677,7 @@ def get_logits_processor_patch(self, **kwargs):
def generation_config_init_patch(self, **kwargs):
self.__init___old(**kwargs)
original_init(self, **kwargs)
self.min_p = kwargs.pop("min_p", 0.0)
self.dynamic_temperature = kwargs.pop("dynamic_temperature", False)
self.dynatemp_low = kwargs.pop("dynatemp_low", 1)
@@ -702,8 +705,5 @@ def generation_config_init_patch(self, **kwargs):
def hijack_samplers():
transformers.GenerationMixin._get_logits_processor_old = transformers.GenerationMixin._get_logits_processor
transformers.GenerationMixin._get_logits_processor = get_logits_processor_patch
transformers.GenerationConfig.__init___old = transformers.GenerationConfig.__init__
transformers.GenerationConfig.__init__ = generation_config_init_patch
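The hijack now keeps the original transformers callables in module-level variables (`original_init`, `original_get_logits_processor`) and has the patches delegate to them, instead of stashing `*_old` attributes on the patched classes. A generic sketch of that save-then-patch pattern; the `Greeter` class is a stand-in for the patched transformers classes, not project code:

```python
# Save-then-patch: keep a module-level reference to the original callable and
# have the patched version delegate to it.
class Greeter:
    def greet(self, name):
        return f"Hello, {name}"


original_greet = Greeter.greet


def patched_greet(self, name):
    return original_greet(self, name).upper() + "!"


def hijack_greeter():
    Greeter.greet = patched_greet


hijack_greeter()
print(Greeter().greet("world"))  # HELLO, WORLD!
```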


@@ -79,6 +79,7 @@ group.add_argument('--model', type=str, help='Name of the model to load by defau
group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.')
group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.')
group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
@@ -91,9 +92,7 @@ group.add_argument('--loader', type=str, help='Choose the model loader manually,
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
group.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
group.add_argument('--gpu-memory', type=str, nargs='+', help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.')
group.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.')
group.add_argument('--cpu-memory', type=float, default=0, help='Maximum CPU memory in GiB. Use this for CPU offloading.')
group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".')
group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')


@@ -7,33 +7,18 @@ import time
import traceback
import numpy as np
import torch
import transformers
from transformers import (
LogitsProcessorList,
is_torch_npu_available,
is_torch_xpu_available
)
import modules.shared as shared
from modules import models, sampler_hijack
from modules import models
from modules.callbacks import (
Iteratorize,
Stream,
_StopEverythingStoppingCriteria
)
from modules.callbacks import Iteratorize
from modules.extensions import apply_extensions
from modules.grammar.grammar_utils import initialize_grammar
from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor
from modules.html_generator import generate_basic_html
from modules.logging_colors import logger
from modules.models import clear_torch_cache, get_device, load_model
sampler_hijack.hijack_samplers()
def generate_reply(*args, **kwargs):
if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
from modules.models import load_model
shared.model, shared.tokenizer = load_model(shared.model_name)
shared.generation_lock.acquire()
@@ -46,7 +31,6 @@ def generate_reply(*args, **kwargs):
def _generate_reply(question, state, stopping_strings=None, is_chat=False, escape_html=False, for_ui=False):
# Find the appropriate generation function
generate_func = apply_extensions('custom_generate_reply')
if generate_func is None:
@@ -80,7 +64,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
all_stop_strings += st
shared.stop_everything = False
seed = set_manual_seed(state['seed'])
last_update = -1
reply = ''
is_stream = state['stream']
@@ -93,7 +76,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
min_update_interval = 1 / state['max_updates_second']
# Generate
for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat):
reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
if escape_html:
reply = html.escape(reply)
@ -132,44 +115,55 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.tokenizer is None: if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded') raise ValueError('No tokenizer is loaded')
if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel']: # llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer': if shared.model.__class__.__name__ == 'LlamaServer':
input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token) input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token)
else: input_ids = np.array(input_ids).reshape(1, len(input_ids))
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
return input_ids
# All other model types
else:
import torch
from modules.torch_utils import get_device
if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel']:
input_ids = shared.tokenizer.encode(str(prompt)) input_ids = shared.tokenizer.encode(str(prompt))
if shared.model.__class__.__name__ != 'Exllamav2Model':
input_ids = np.array(input_ids).reshape(1, len(input_ids))
else:
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
if shared.model.__class__.__name__ not in ['Exllamav2Model']: if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None:
input_ids = np.array(input_ids).reshape(1, len(input_ids)) if add_bos_token:
else: # Add BOS token if missing
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0:
bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]])
input_ids = torch.cat((bos_tensor, input_ids), 1)
if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None: # Prevent double BOS tokens from jinja templates
if add_bos_token: while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id:
if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0: input_ids = input_ids[:, 1:]
# Add a missing bos token (it may not have been added due to faulty model metadata) else:
bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]]) # Remove BOS tokens when not wanted
input_ids = torch.cat((bos_tensor, input_ids), 1) while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
input_ids = input_ids[:, 1:]
# Prevent double bos token due to jinja templates with <s> somewhere if truncation_length is not None:
while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id: input_ids = input_ids[:, -truncation_length:]
input_ids = input_ids[:, 1:]
else:
# Remove any bos token that may have been added
while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
input_ids = input_ids[:, 1:]
# Handling truncation if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
if truncation_length is not None: return input_ids
input_ids = input_ids[:, -truncation_length:] else:
device = get_device()
if device:
return input_ids.to(device)
if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu: return input_ids
return input_ids
else:
device = get_device()
if device:
return input_ids.to(device)
return input_ids
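Note: the reworked encode() above normalizes BOS handling for the non-llama.cpp path. A minimal, self-contained sketch of that normalization on a plain list of token ids (normalize_bos is an illustrative helper, not part of the codebase; bos_token_id=1 is an invented value):

def normalize_bos(ids, bos_token_id=1, add_bos_token=True):
    # Illustrative helper mirroring the BOS logic above, on a plain Python list
    if add_bos_token:
        # Add a missing BOS token (it may be absent due to faulty model metadata)
        if len(ids) == 0 or ids[0] != bos_token_id:
            ids = [bos_token_id] + ids
        # Collapse doubled BOS tokens, e.g. from jinja templates that contain <s>
        while len(ids) > 1 and ids[0] == bos_token_id and ids[1] == bos_token_id:
            ids = ids[1:]
    else:
        # Strip BOS tokens when they are not wanted
        while len(ids) > 0 and ids[0] == bos_token_id:
            ids = ids[1:]
    return ids

assert normalize_bos([5, 6]) == [1, 5, 6]
assert normalize_bos([1, 1, 5]) == [1, 5]
assert normalize_bos([1, 5], add_bos_token=False) == [5]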
def decode(output_ids, skip_special_tokens=True): def decode(output_ids, skip_special_tokens=True):
@ -225,13 +219,17 @@ def set_manual_seed(seed):
if seed == -1: if seed == -1:
seed = random.randint(1, 2**31) seed = random.randint(1, 2**31)
torch.manual_seed(seed) if shared.args.loader != 'llama.cpp':
if torch.cuda.is_available(): import torch
torch.cuda.manual_seed_all(seed) from transformers import is_torch_npu_available, is_torch_xpu_available
elif is_torch_xpu_available():
torch.xpu.manual_seed_all(seed) torch.manual_seed(seed)
elif is_torch_npu_available(): if torch.cuda.is_available():
torch.npu.manual_seed_all(seed) torch.cuda.manual_seed_all(seed)
elif is_torch_xpu_available():
torch.xpu.manual_seed_all(seed)
elif is_torch_npu_available():
torch.npu.manual_seed_all(seed)
return seed return seed
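Note: set_manual_seed() now touches torch only when the loader is not llama.cpp, so portable builds without torch still get a usable seed. A hedged sketch of the same idea (resolve_seed is an illustrative stand-in; torch is assumed to be installed only for the non-llama.cpp branch):

import random

def resolve_seed(seed, loader='llama.cpp'):
    # -1 means "pick a fresh random seed"
    if seed == -1:
        seed = random.randint(1, 2**31)
    if loader != 'llama.cpp':
        import torch  # deferred: llama.cpp-only installs never import torch
        torch.manual_seed(seed)
    return seed

print(resolve_seed(-1))                  # fresh random seed, no torch import
print(resolve_seed(42, 'Transformers'))  # 42, and torch is seeded as well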
@ -285,10 +283,26 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
return reply return reply
def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False): def generate_reply_HF(question, original_question, state, stopping_strings=None, is_chat=False):
import torch
import transformers
from transformers import LogitsProcessorList
from modules.grammar.grammar_utils import initialize_grammar
from modules.grammar.logits_process import (
GrammarConstrainedLogitsProcessor
)
from modules.torch_utils import clear_torch_cache, get_device
from modules.transformers_loader import (
Stream,
_StopEverythingStoppingCriteria
)
if shared.args.loader == 'Transformers': if shared.args.loader == 'Transformers':
clear_torch_cache() clear_torch_cache()
seed = set_manual_seed(state['seed'])
generate_params = {} generate_params = {}
for k in [ for k in [
'temperature', 'temperature',
@ -458,12 +472,12 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
return return
def generate_reply_custom(question, original_question, seed, state, stopping_strings=None, is_chat=False): def generate_reply_custom(question, original_question, state, stopping_strings=None, is_chat=False):
""" """
For models that do not use the transformers library for sampling For models that do not use the transformers library for sampling
""" """
seed = set_manual_seed(state['seed'])
seed = set_manual_seed(state['seed'])
t0 = time.time() t0 = time.time()
reply = '' reply = ''
try: try:
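Note: with the seed argument dropped from the backend signatures, each backend now derives its seed from state['seed'] itself. A toy illustration of the new call shape (set_manual_seed is stubbed and the reply text is invented):

import random

def set_manual_seed_stub(seed):
    return random.randint(1, 2**31) if seed == -1 else seed

def backend(question, original_question, state, stopping_strings=None, is_chat=False):
    seed = set_manual_seed_stub(state['seed'])  # resolved inside the backend now
    yield f"(seed={seed}) reply to: {question}"

for chunk in backend("Hello", "Hello", {'seed': 42}):
    print(chunk)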

37 modules/torch_utils.py Normal file
View file

@ -0,0 +1,37 @@
import gc
import torch
from accelerate.utils import is_npu_available, is_xpu_available
from transformers import is_torch_npu_available, is_torch_xpu_available
from modules import shared
def get_device():
if torch.cuda.is_available():
return torch.device('cuda')
elif shared.args.deepspeed:
import deepspeed
return deepspeed.get_accelerator().current_device_name()
elif torch.backends.mps.is_available():
return torch.device('mps')
elif is_torch_xpu_available():
return torch.device('xpu:0')
elif is_torch_npu_available():
return torch.device('npu:0')
else:
return None
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
if torch.cuda.is_available():
torch.cuda.empty_cache()
elif is_xpu_available():
torch.xpu.empty_cache()
elif is_npu_available():
torch.npu.empty_cache()
elif torch.backends.mps.is_available():
if hasattr(torch.backends.mps, 'empty_cache'):
torch.backends.mps.empty_cache()
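Note: a hedged usage sketch for the new helpers (assumes a full install where torch is present; in a portable llama.cpp build this module is simply never imported):

import torch

from modules.torch_utils import clear_torch_cache, get_device

device = get_device()   # cuda / mps / xpu / npu device, or None on CPU-only setups
x = torch.ones(2, 2)
if device:
    x = x.to(device)
print(x.device)

clear_torch_cache()     # gc.collect() plus a backend cache flush between generations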

View file

@ -15,13 +15,6 @@ from datetime import datetime
from pathlib import Path from pathlib import Path
import gradio as gr import gradio as gr
import torch
import transformers
from datasets import Dataset, load_dataset
from transformers import is_torch_xpu_available
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
from modules import shared, ui, utils from modules import shared, ui, utils
from modules.evaluate import ( from modules.evaluate import (
@ -33,7 +26,6 @@ from modules.logging_colors import logger
from modules.models import reload_model from modules.models import reload_model
from modules.utils import natural_keys from modules.utils import natural_keys
MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}
PARAMETERS = ["lora_name", "always_override", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"] PARAMETERS = ["lora_name", "always_override", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"]
WANT_INTERRUPT = False WANT_INTERRUPT = False
@ -284,6 +276,9 @@ def calc_trainable_parameters(model):
def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str): def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str):
import torch
import transformers
from datasets import Dataset, load_dataset
from peft import ( from peft import (
LoraConfig, LoraConfig,
get_peft_model, get_peft_model,
@ -293,6 +288,12 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
from peft.utils.other import \ from peft.utils.other import \
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as \ TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as \
model_to_lora_modules model_to_lora_modules
from transformers import is_torch_xpu_available
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}
global WANT_INTERRUPT global WANT_INTERRUPT
WANT_INTERRUPT = False WANT_INTERRUPT = False
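Note: the pattern here (and in generate_reply_HF above) is to defer heavy imports such as torch, transformers, datasets, and peft into the function body, so that merely importing the module stays cheap on portable builds. Minimal sketch of the idea (numpy stands in for the heavy dependency):

def do_heavy_work(values):
    import numpy as np  # imported only when this code path actually runs
    return float(np.mean(values))

print(do_heavy_work([1, 2, 3]))  # 2.0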

View file

@ -0,0 +1,279 @@
import os
import pprint
from pathlib import Path
import torch
import torch.nn.functional as F
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.utils import (
is_ccl_available,
is_npu_available,
is_xpu_available
)
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
BitsAndBytesConfig,
LogitsProcessor
)
import modules.shared as shared
from modules.logging_colors import logger
from modules.text_generation import get_reply_from_output_ids
from modules.torch_utils import get_device
transformers.logging.set_verbosity_error()
local_rank = None
if shared.args.deepspeed:
import deepspeed
from transformers.integrations.deepspeed import (
HfDeepSpeedConfig,
is_deepspeed_zero3_enabled
)
from modules.deepspeed_parameters import generate_ds_config
# Distributed setup
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
if is_xpu_available() and is_ccl_available():
torch.xpu.set_device(local_rank)
deepspeed.init_distributed(backend="ccl")
elif is_npu_available():
torch.npu.set_device(local_rank)
deepspeed.init_distributed(dist_backend="hccl")
else:
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
def __init__(self):
transformers.StoppingCriteria.__init__(self)
def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
return shared.stop_everything
class Stream(transformers.StoppingCriteria):
def __init__(self, callback_func=None):
self.callback_func = callback_func
def __call__(self, input_ids, scores) -> bool:
if self.callback_func is not None:
self.callback_func(input_ids[0])
return False
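Note: these criteria are meant to be chained into transformers' generate() call. An illustrative wiring, not part of the diff (a loaded model and prepared input_ids are assumed):

from transformers import StoppingCriteriaList

stopping_criteria = StoppingCriteriaList([
    Stream(callback_func=lambda ids: print(f"{len(ids)} tokens so far")),
    _StopEverythingStoppingCriteria(),  # aborts as soon as shared.stop_everything is set
])
# output = shared.model.generate(input_ids, stopping_criteria=stopping_criteria, ...)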
class LogitsBiasProcessor(LogitsProcessor):
def __init__(self, logit_bias={}):
self.logit_bias = logit_bias
if self.logit_bias:
self.keys = list([int(key) for key in self.logit_bias.keys()])
values = [self.logit_bias[str(key)] for key in self.keys]
self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device)
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logit_bias:
logits[0, self.keys] += self.values
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>"
class LogprobProcessor(LogitsProcessor):
def __init__(self, logprobs=None):
self.logprobs = logprobs
self.token_alternatives = {}
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logprobs is not None: # 0-5
log_e_probabilities = F.log_softmax(logits, dim=1)
top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1)
top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]]
top_probs = [float(x) for x in top_values[0]]
self.token_alternatives = dict(zip(top_tokens, top_probs))
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>"
def load_tokenizer(model_name, tokenizer_dir=None):
if tokenizer_dir:
path_to_model = Path(tokenizer_dir)
else:
path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
tokenizer = None
if path_to_model.exists():
if shared.args.no_use_fast:
logger.info('Loading the tokenizer with use_fast=False.')
tokenizer = AutoTokenizer.from_pretrained(
path_to_model,
trust_remote_code=shared.args.trust_remote_code,
use_fast=not shared.args.no_use_fast
)
return tokenizer
def load_model_HF(model_name):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
params = {
'low_cpu_mem_usage': True,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
}
if shared.args.trust_remote_code:
params['trust_remote_code'] = True
if shared.args.use_flash_attention_2:
params['use_flash_attention_2'] = True
if shared.args.force_safetensors:
params['force_safetensors'] = True
if shared.args.use_eager_attention:
params['attn_implementation'] = 'eager'
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
if 'chatglm' in model_name.lower():
LoaderClass = AutoModel
else:
if config.to_dict().get('is_encoder_decoder', False):
LoaderClass = AutoModelForSeq2SeqLM
shared.is_seq2seq = True
else:
LoaderClass = AutoModelForCausalLM
# Determine if we should use default loading
should_use_default_loading = not any([
shared.args.cpu,
shared.args.load_in_8bit,
shared.args.load_in_4bit,
shared.args.disk,
shared.args.deepspeed,
shared.args.cpu_memory is not None,
shared.args.compress_pos_emb > 1,
shared.args.alpha_value > 1,
])
# Load the model without any special settings
if should_use_default_loading:
params['device_map'] = 'auto'
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit):
device = get_device()
if device:
model = model.to(device)
# DeepSpeed ZeRO-3
elif shared.args.deepspeed:
model = LoaderClass.from_pretrained(
path_to_model,
torch_dtype=params['torch_dtype'],
trust_remote_code=params.get('trust_remote_code')
)
model = deepspeed.initialize(
model=model,
config_params=ds_config,
model_parameters=None,
optimizer=None,
lr_scheduler=None
)[0]
model.module.eval() # Inference
logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
# Load with quantization and/or offloading
else:
if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
shared.args.cpu = True
if shared.args.cpu:
params['torch_dtype'] = torch.float32
else:
params['device_map'] = 'auto'
if x := get_max_memory_dict():
params['max_memory'] = x
if shared.args.load_in_4bit:
# See https://github.com/huggingface/transformers/pull/23479/files
# and https://huggingface.co/blog/4bit-transformers-bitsandbytes
quantization_config_params = {
'load_in_4bit': True,
'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
'bnb_4bit_quant_type': shared.args.quant_type,
'bnb_4bit_use_double_quant': shared.args.use_double_quant,
'llm_int8_enable_fp32_cpu_offload': True
}
params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
elif shared.args.load_in_8bit:
if shared.args.gpu_split:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
else:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
if params.get('max_memory') is not None:
with init_empty_weights():
model = LoaderClass.from_config(config, trust_remote_code=params.get('trust_remote_code'))
model.tie_weights()
params['device_map'] = infer_auto_device_map(
model,
dtype=torch.int8,
max_memory=params.get('max_memory'),
no_split_module_classes=model._no_split_modules
)
if shared.args.disk:
params['offload_folder'] = shared.args.disk_cache_dir
if shared.args.compress_pos_emb > 1:
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
elif shared.args.alpha_value > 1:
params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if shared.args.torch_compile:
model = torch.compile(model)
return model
def get_max_memory_dict():
max_memory = {}
if shared.args.cpu_memory > 0:
max_memory['cpu'] = f'{shared.args.cpu_memory}GiB'
if shared.args.gpu_split:
for i, memory in enumerate(shared.args.gpu_split.split(',')):
max_memory[i] = f'{memory}GiB'
return max_memory if len(max_memory) > 0 else None
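Note: a worked example of how the launch flags map onto the max_memory dict that accelerate consumes. The helper mirrors the logic above and is not part of the codebase; the flag values are invented:

# --gpu-split 20,7 --cpu-memory 64  ->  {'cpu': '64GiB', 0: '20GiB', 1: '7GiB'}
def max_memory_from_flags(gpu_split=None, cpu_memory=0):
    max_memory = {}
    if cpu_memory and cpu_memory > 0:
        max_memory['cpu'] = f'{cpu_memory}GiB'
    if gpu_split:
        for i, memory in enumerate(gpu_split.split(',')):
            max_memory[i] = f'{memory}GiB'
    return max_memory if max_memory else None

print(max_memory_from_flags('20,7', 64))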

View file

@ -2,9 +2,7 @@ import copy
from pathlib import Path from pathlib import Path
import gradio as gr import gradio as gr
import torch
import yaml import yaml
from transformers import is_torch_xpu_available
import extensions import extensions
from modules import shared from modules import shared
@ -128,7 +126,6 @@ def list_model_elements():
'torch_compile', 'torch_compile',
'flash_attn', 'flash_attn',
'use_flash_attention_2', 'use_flash_attention_2',
'auto_devices',
'cpu', 'cpu',
'disk', 'disk',
'row_split', 'row_split',
@ -150,13 +147,6 @@ def list_model_elements():
'no_use_fast', 'no_use_fast',
] ]
if is_torch_xpu_available():
for i in range(torch.xpu.device_count()):
elements.append(f'gpu_memory_{i}')
else:
for i in range(torch.cuda.device_count()):
elements.append(f'gpu_memory_{i}')
return elements return elements

View file

@ -1,14 +1,9 @@
import importlib import importlib
import math
import re
import traceback import traceback
from functools import partial from functools import partial
from pathlib import Path from pathlib import Path
import gradio as gr import gradio as gr
import psutil
import torch
from transformers import is_torch_npu_available, is_torch_xpu_available
from modules import loaders, shared, ui, utils from modules import loaders, shared, ui, utils
from modules.logging_colors import logger from modules.logging_colors import logger
@ -27,35 +22,6 @@ from modules.utils import gradio
def create_ui(): def create_ui():
mu = shared.args.multi_user mu = shared.args.multi_user
# Finding the default values for the GPU and CPU memories
total_mem = []
if is_torch_xpu_available():
for i in range(torch.xpu.device_count()):
total_mem.append(math.floor(torch.xpu.get_device_properties(i).total_memory / (1024 * 1024)))
elif is_torch_npu_available():
for i in range(torch.npu.device_count()):
total_mem.append(math.floor(torch.npu.get_device_properties(i).total_memory / (1024 * 1024)))
else:
for i in range(torch.cuda.device_count()):
total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))
default_gpu_mem = []
if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
for i in shared.args.gpu_memory:
if 'mib' in i.lower():
default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
else:
default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)
while len(default_gpu_mem) < len(total_mem):
default_gpu_mem.append(0)
total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024))
if shared.args.cpu_memory is not None:
default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory)
else:
default_cpu_mem = 0
with gr.Tab("Model", elem_id="model-tab"): with gr.Tab("Model", elem_id="model-tab"):
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
@ -80,10 +46,6 @@ def create_ui():
with gr.Blocks(): with gr.Blocks():
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
for i in range(len(total_mem)):
shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i])
shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem)
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
@ -94,6 +56,7 @@ def create_ui():
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.') shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
@ -107,7 +70,6 @@ def create_ui():
shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')

View file

@ -15,7 +15,6 @@ import sys
# os.environ["HSA_OVERRIDE_GFX_VERSION"] = '10.3.0' # os.environ["HSA_OVERRIDE_GFX_VERSION"] = '10.3.0'
# os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030' # os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030'
# Define the required versions # Define the required versions
TORCH_VERSION = "2.6.0" TORCH_VERSION = "2.6.0"
TORCHVISION_VERSION = "0.21.0" TORCHVISION_VERSION = "0.21.0"
@ -62,6 +61,19 @@ def is_x86_64():
return platform.machine() == "x86_64" return platform.machine() == "x86_64"
def is_installed():
site_packages_path = None
for sitedir in site.getsitepackages():
if "site-packages" in sitedir and conda_env_path in sitedir:
site_packages_path = sitedir
break
if site_packages_path:
return os.path.isfile(os.path.join(site_packages_path, 'torch', '__init__.py'))
else:
return os.path.isdir(conda_env_path)
def cpu_has_avx2(): def cpu_has_avx2():
try: try:
import cpuinfo import cpuinfo
@ -104,44 +116,13 @@ def torch_version():
return torver return torver
def update_pytorch_and_python(): def get_current_commit():
print_big_message("Checking for PyTorch updates.") result = run_cmd("git rev-parse HEAD", capture_output=True, environment=True)
return result.stdout.decode('utf-8').strip()
# Update the Python version. Left here for future reference in case this becomes necessary.
# print_big_message("Checking for PyTorch and Python updates.")
# current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
# if current_python_version != PYTHON_VERSION:
# run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True)
torver = torch_version()
base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}"
if "+cu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
elif "+rocm" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
elif "+cpu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
elif "+cxx11" in torver:
intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
else:
install_cmd = base_cmd
run_cmd(install_cmd, assert_success=True, environment=True)
def is_installed(): def get_extensions_names():
site_packages_path = None return [foldername for foldername in os.listdir('extensions') if os.path.isfile(os.path.join('extensions', foldername, 'requirements.txt'))]
for sitedir in site.getsitepackages():
if "site-packages" in sitedir and conda_env_path in sitedir:
site_packages_path = sitedir
break
if site_packages_path:
return os.path.isfile(os.path.join(site_packages_path, 'torch', '__init__.py'))
else:
return os.path.isdir(conda_env_path)
def check_env(): def check_env():
@ -157,35 +138,11 @@ def check_env():
sys.exit(1) sys.exit(1)
def get_current_commit():
result = run_cmd("git rev-parse HEAD", capture_output=True, environment=True)
return result.stdout.decode('utf-8').strip()
def clear_cache(): def clear_cache():
run_cmd("conda clean -a -y", environment=True) run_cmd("conda clean -a -y", environment=True)
run_cmd("python -m pip cache purge", environment=True) run_cmd("python -m pip cache purge", environment=True)
def print_big_message(message):
message = message.strip()
lines = message.split('\n')
print("\n\n*******************************************************************")
for line in lines:
print("*", line)
print("*******************************************************************\n\n")
def calculate_file_hash(file_path):
p = os.path.join(script_dir, file_path)
if os.path.isfile(p):
with open(p, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
else:
return ''
def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, env=None): def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, env=None):
# Use the conda environment # Use the conda environment
if environment: if environment:
@ -210,6 +167,25 @@ def run_cmd(cmd, assert_success=False, environment=False, capture_output=False,
return result return result
def print_big_message(message):
message = message.strip()
lines = message.split('\n')
print("\n\n*******************************************************************")
for line in lines:
print("*", line)
print("*******************************************************************\n\n")
def calculate_file_hash(file_path):
p = os.path.join(script_dir, file_path)
if os.path.isfile(p):
with open(p, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
else:
return ''
def generate_alphabetic_sequence(index): def generate_alphabetic_sequence(index):
result = '' result = ''
while index >= 0: while index >= 0:
@ -238,6 +214,51 @@ def get_user_choice(question, options_dict):
return choice return choice
def update_pytorch_and_python():
print_big_message("Checking for PyTorch updates.")
# Update the Python version. Left here for future reference in case this becomes necessary.
# print_big_message("Checking for PyTorch and Python updates.")
# current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
# if current_python_version != PYTHON_VERSION:
# run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True)
torver = torch_version()
base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}"
if "+cu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
elif "+rocm" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
elif "+cpu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
elif "+cxx11" in torver:
intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
else:
install_cmd = base_cmd
run_cmd(install_cmd, assert_success=True, environment=True)
def clean_outdated_pytorch_cuda_dependencies():
patterns = ["cu121", "cu122", "torch2.4"]
result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True)
matching_packages = []
for line in result.stdout.decode('utf-8').splitlines():
if "==" in line:
pkg_name, version = line.split('==', 1)
if any(pattern in version for pattern in patterns):
matching_packages.append(pkg_name)
if matching_packages:
print(f"\nUninstalling: {', '.join(matching_packages)}\n")
run_cmd(f"python -m pip uninstall -y {' '.join(matching_packages)}", assert_success=True, environment=True)
return matching_packages
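Note: the version-string matching above operates on the output of pip list --format=freeze. A self-contained example with invented package lines:

patterns = ["cu121", "cu122", "torch2.4"]
freeze_output = """torch==2.4.1+cu121
numpy==1.26.4
exllamav2==0.2.7+cu121.torch2.4.1"""

matching_packages = []
for line in freeze_output.splitlines():
    if "==" in line:
        pkg_name, version = line.split('==', 1)
        if any(pattern in version for pattern in patterns):
            matching_packages.append(pkg_name)

print(matching_packages)  # ['torch', 'exllamav2']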
def install_webui(): def install_webui():
if os.path.isfile(state_file): if os.path.isfile(state_file):
os.remove(state_file) os.remove(state_file)
@ -323,37 +344,6 @@ def install_webui():
update_requirements(initial_installation=True, pull=False) update_requirements(initial_installation=True, pull=False)
def get_extensions_names():
return [foldername for foldername in os.listdir('extensions') if os.path.isfile(os.path.join('extensions', foldername, 'requirements.txt'))]
def install_extensions_requirements():
print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.")
extensions = get_extensions_names()
for i, extension in enumerate(extensions):
print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n")
extension_req_path = os.path.join("extensions", extension, "requirements.txt")
run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True)
def clean_outdated_pytorch_cuda_dependencies():
patterns = ["cu121", "cu122", "torch2.4"]
result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True)
matching_packages = []
for line in result.stdout.decode('utf-8').splitlines():
if "==" in line:
pkg_name, version = line.split('==', 1)
if any(pattern in version for pattern in patterns):
matching_packages.append(pkg_name)
if matching_packages:
print(f"\nUninstalling: {', '.join(matching_packages)}\n")
run_cmd(f"python -m pip uninstall -y {' '.join(matching_packages)}", assert_success=True, environment=True)
return matching_packages
def update_requirements(initial_installation=False, pull=True): def update_requirements(initial_installation=False, pull=True):
# Create .git directory if missing # Create .git directory if missing
if not os.path.exists(os.path.join(script_dir, ".git")): if not os.path.exists(os.path.join(script_dir, ".git")):
@ -366,14 +356,18 @@ def update_requirements(initial_installation=False, pull=True):
) )
torver = torch_version() torver = torch_version()
requirements_base = os.path.join("requirements", "full")
if "+rocm" in torver: if "+rocm" in torver:
requirements_file = "requirements_amd" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt" file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt"
elif "+cpu" in torver or "+cxx11" in torver: elif "+cpu" in torver or "+cxx11" in torver:
requirements_file = "requirements_cpu_only" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt" file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt"
elif is_macos(): elif is_macos():
requirements_file = "requirements_apple_" + ("intel" if is_x86_64() else "silicon") + ".txt" file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt"
else: else:
requirements_file = "requirements" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt" file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt"
requirements_file = os.path.join(requirements_base, file_name)
# Load state from JSON file # Load state from JSON file
current_commit = get_current_commit() current_commit = get_current_commit()
@ -475,6 +469,15 @@ def update_requirements(initial_installation=False, pull=True):
clear_cache() clear_cache()
def install_extensions_requirements():
print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.")
extensions = get_extensions_names()
for i, extension in enumerate(extensions):
print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n")
extension_req_path = os.path.join("extensions", extension, "requirements.txt")
run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True)
def launch_webui(): def launch_webui():
run_cmd(f"python server.py {flags}", environment=True) run_cmd(f"python server.py {flags}", environment=True)

View file

@ -7,7 +7,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -26,14 +25,13 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# CUDA wheels # CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,12 +24,11 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# AMD wheels # AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,12 +24,11 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# AMD wheels # AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,14 +24,12 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# Mac wheels # Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,13 +24,13 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# Mac wheels # Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,11 +24,10 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# llama.cpp (CPU only, AVX2) # llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,11 +24,10 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# llama.cpp (CPU only, no AVX2) # llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -7,7 +7,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -26,14 +25,13 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# CUDA wheels # CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,7 +24,6 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,18 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,18 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"

View file

@ -0,0 +1,20 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,15 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

View file

@ -1,11 +1,8 @@
 import os
 import warnings

-from modules import shared
-import accelerate  # This early import makes Intel GPUs happy
 import modules.one_click_installer_check
+from modules import shared
 from modules.block_requests import OpenMonkeyPatch, RequestBlocker
 from modules.logging_colors import logger
@ -38,7 +35,6 @@ import yaml
 import modules.extensions as extensions_module
 from modules import (
-    chat,
     training,
     ui,
     ui_chat,
@ -89,7 +85,7 @@ def create_interface():
     # Force some events to be triggered on page load
     shared.persistent_interface_state.update({
-        'loader': shared.args.loader or 'Transformers',
+        'loader': shared.args.loader or 'llama.cpp',
         'mode': shared.settings['mode'] if shared.settings['mode'] == 'instruct' else gr.update(),
         'character_menu': shared.args.character or shared.settings['character'],
         'instruction_template_str': shared.settings['instruction_template_str'],
@ -218,10 +214,28 @@ if __name__ == "__main__":
         if extension not in shared.args.extensions:
             shared.args.extensions.append(extension)

+    available_models = utils.get_available_models()
+
     # Model defined through --model
     if shared.args.model is not None:
         shared.model_name = shared.args.model

+    # Select the model from a command-line menu
+    elif shared.args.model_menu:
+        if len(available_models) == 0:
+            logger.error('No models are available! Please download at least one.')
+            sys.exit(0)
+        else:
+            print('The following models are available:\n')
+            for i, model in enumerate(available_models):
+                print(f'{i+1}. {model}')
+
+            print(f'\nWhich one do you want to load? 1-{len(available_models)}\n')
+            i = int(input()) - 1
+            print()
+
+        shared.model_name = available_models[i]
+
     # If any model has been selected, load it
     if shared.model_name != 'None':
         p = Path(shared.model_name)

View file

@ -2,6 +2,12 @@
cd "$(dirname "${BASH_SOURCE[0]}")" cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch "$@"
exit $?
fi
if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
# deactivate existing conda envs as needed to avoid conflicts # deactivate existing conda envs as needed to avoid conflicts

View file

@ -2,6 +2,12 @@
cd "$(dirname "${BASH_SOURCE[0]}")" cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@"
exit $?
fi
if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
# deactivate existing conda envs as needed to avoid conflicts # deactivate existing conda envs as needed to avoid conflicts

View file

@ -3,6 +3,12 @@ setlocal enabledelayedexpansion
cd /D "%~dp0" cd /D "%~dp0"
@rem Portable install case
if exist "portable_env" (
.\portable_env\python.exe server.py --api --auto-launch %*
exit /b %errorlevel%
)
set PATH=%PATH%;%SystemRoot%\system32 set PATH=%PATH%;%SystemRoot%\system32
echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end