Merge pull request #6869 from oobabooga/dev

Merge dev branch
oobabooga 2025-04-22 12:09:20 -03:00 committed by GitHub
commit a778270536
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
48 changed files with 1292 additions and 705 deletions


@ -5,8 +5,14 @@
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
- package-ecosystem: "pip"
directory: "/requirements/full/"
target-branch: "dev"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/requirements/portable/"
target-branch: "dev"
schedule:
interval: "weekly"


@ -0,0 +1,49 @@
name: Build Everything TGW
on:
workflow_dispatch:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
permissions:
contents: write
jobs:
build_release_cuda_windows:
name: CUDA Windows
uses: ./.github/workflows/build-portable-release-cuda.yml
with:
version: ${{ inputs.version }}
config: 'os:windows-2019'
build_release_cuda_linux:
name: CUDA Linux
uses: ./.github/workflows/build-portable-release-cuda.yml
with:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'
build_release_cpu_windows:
name: CPU Windows
uses: ./.github/workflows/build-portable-release.yml
with:
version: ${{ inputs.version }}
config: 'os:windows-2019'
build_release_cpu_linux:
name: CPU Linux
uses: ./.github/workflows/build-portable-release.yml
with:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'
build_release_macos:
name: macOS
uses: ./.github/workflows/build-portable-release.yml
with:
version: ${{ inputs.version }}
config: 'os:macos-13,macos-14'


@ -0,0 +1,183 @@
name: Build CUDA
on:
workflow_dispatch:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
workflow_call:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
permissions:
contents: write
jobs:
define_matrix:
name: Define Build Matrix
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
defaults:
run:
shell: pwsh
env:
CONFIGIN: ${{ inputs.config }}
EXCLUDEIN: ${{ inputs.exclude }}
steps:
- name: Define Job Output
id: set-matrix
run: |
$matrix = @{
'os' = @('ubuntu-22.04', 'windows-2019')
'pyver' = @("3.11")
'avx' = @("AVX2")
'cuda' = @("11.7", "12.4")
}
if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
if ($env:EXCLUDEIN -ne 'None') {
$exclusions = @()
$exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
$matrix['exclude'] = $exclusions
}
$matrixOut = ConvertTo-Json $matrix -Compress
Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
build_wheels:
name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }} CUDA ${{ matrix.cuda }}
needs: define_matrix
runs-on: ${{ matrix.os }}
strategy:
matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
defaults:
run:
shell: pwsh
env:
AVXVER: ${{ matrix.avx }}
PCKGVER: ${{ inputs.version }}
steps:
- uses: actions/checkout@v4
with:
repository: 'oobabooga/text-generation-webui'
ref: ${{ inputs.version }}
submodules: 'recursive'
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.pyver }}
- name: Build Package
shell: bash
run: |
rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
# Define common variables
CUDA_VERSION="${{ matrix.cuda }}"
AVX_SUPPORT="${{ matrix.avx }}"
VERSION="${{ inputs.version }}"
# 1. Set platform-specific variables
if [[ "$RUNNER_OS" == "Windows" ]]; then
PLATFORM="windows"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-pc-windows-msvc-install_only.tar.gz"
PIP_PATH="portable_env/python.exe -m pip"
PACKAGES_PATH="portable_env/Lib/site-packages"
ZIP_CMD="powershell -Command \"Compress-Archive -Path text-generation-webui -DestinationPath"
rm start_linux.sh start_macos.sh
else
PLATFORM="linux"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
PIP_PATH="portable_env/bin/python -m pip"
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
ZIP_CMD="zip -r"
rm start_macos.sh start_windows.bat
fi
# 2. Download and extract Python
cd ..
echo "Downloading Python for $PLATFORM..."
curl -L -o python-build.tar.gz "$PYTHON_URL"
tar -xzf python-build.tar.gz
mv python text-generation-webui/portable_env
# 3. Prepare requirements file based on AVX and CUDA
if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
BASE_REQ_FILE="requirements/portable/requirements.txt"
else
BASE_REQ_FILE="requirements/portable/requirements_noavx2.txt"
fi
# Create CUDA-specific requirements file if needed
cd text-generation-webui
if [[ "$CUDA_VERSION" == "11.7" ]]; then
echo "Creating CUDA 11.7 specific requirements file"
sed 's/cu124/cu117/g' "$BASE_REQ_FILE" > requirements_cuda_temp.txt
REQ_FILE="requirements_cuda_temp.txt"
else
REQ_FILE="$BASE_REQ_FILE"
fi
# 4. Install packages
echo "Installing Python packages from $REQ_FILE..."
$PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
# 5. Clean up
if [[ "$CUDA_VERSION" == "11.7" ]]; then
rm requirements_cuda_temp.txt
fi
# 6. Create ZIP file
cd ..
VERSION_CLEAN="${VERSION#v}"
ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
echo "Creating archive: $ZIP_NAME"
if [[ "$RUNNER_OS" == "Windows" ]]; then
powershell -Command "Compress-Archive -Path text-generation-webui -DestinationPath $ZIP_NAME"
else
zip -r "$ZIP_NAME" text-generation-webui
fi
- name: Upload files to a GitHub release
id: upload-release
uses: svenstaro/upload-release-action@2.7.0
continue-on-error: true
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: ../textgen-portable-${{ inputs.version }}*.zip
tag: ${{ inputs.version }}
file_glob: true
make_latest: false
overwrite: true
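
The `config` and `exclude` inputs of the two portable-build workflows use a compact `key1:item1,item2;key2:...` syntax that the `define_matrix` job expands into a GitHub Actions build matrix with PowerShell. A minimal Python sketch of the same expansion, with illustrative function names and example strings (not part of the workflow; the defaults mirror the CUDA workflow above), could look like this:

def parse_config(config: str) -> dict:
    # 'os:windows-2019;cuda:12.4' -> {'os': ['windows-2019'], ..., 'cuda': ['12.4']}
    matrix = {
        'os': ['ubuntu-22.04', 'windows-2019'],
        'pyver': ['3.11'],
        'avx': ['AVX2'],
        'cuda': ['11.7', '12.4'],
    }
    if config != 'Default':
        for group in config.split(';'):
            key, items = group.split(':', 1)
            matrix[key] = items.split(',')
    return matrix

def parse_exclude(exclude: str) -> list:
    # 'os:windows-2019,cuda:11.7;os:ubuntu-22.04,cuda:11.7' -> one exclusion dict per ';' group
    if exclude == 'None':
        return []
    return [dict(pair.split(':', 1) for pair in group.split(','))
            for group in exclude.split(';')]

print(parse_config('os:windows-2019;cuda:12.4'))
print(parse_exclude('os:windows-2019,cuda:11.7'))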


@ -0,0 +1,193 @@
name: Build CPU and macOS
on:
workflow_dispatch:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
workflow_call:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
permissions:
contents: write
jobs:
define_matrix:
name: Define Build Matrix
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
defaults:
run:
shell: pwsh
env:
CONFIGIN: ${{ inputs.config }}
EXCLUDEIN: ${{ inputs.exclude }}
steps:
- name: Define Job Output
id: set-matrix
run: |
$matrix = @{
'os' = @('ubuntu-22.04', 'windows-2019', 'macos-13', 'macos-14')
'pyver' = @("3.11")
'avx' = @("AVX2")
}
if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
if ($env:EXCLUDEIN -ne 'None') {
$exclusions = @()
$exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
$matrix['exclude'] = $exclusions
}
$matrixOut = ConvertTo-Json $matrix -Compress
Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
build_wheels:
name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }}
needs: define_matrix
runs-on: ${{ matrix.os }}
strategy:
matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
defaults:
run:
shell: pwsh
env:
AVXVER: ${{ matrix.avx }}
PCKGVER: ${{ inputs.version }}
steps:
- uses: actions/checkout@v4
with:
repository: 'oobabooga/text-generation-webui'
ref: ${{ inputs.version }}
submodules: 'recursive'
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.pyver }}
- name: Build Package
shell: bash
run: |
rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"
VERSION="${{ inputs.version }}"
OS_TYPE="${{ matrix.os }}"
# 1. Set platform-specific variables
if [[ "$RUNNER_OS" == "Windows" ]]; then
PLATFORM="windows-cpu"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-pc-windows-msvc-install_only.tar.gz"
PIP_PATH="portable_env/python.exe -m pip"
PACKAGES_PATH="portable_env/Lib/site-packages"
rm start_linux.sh start_macos.sh
elif [[ "$RUNNER_OS" == "macOS" ]]; then
if [[ "$OS_TYPE" == "macos-13" ]]; then
PLATFORM="macos-x86_64"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-apple-darwin-install_only.tar.gz"
REQ_TYPE="apple_intel"
else
PLATFORM="macos-arm64"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-aarch64-apple-darwin-install_only.tar.gz"
REQ_TYPE="apple_silicon"
fi
PIP_PATH="portable_env/bin/python -m pip"
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
rm start_linux.sh start_windows.bat
else
# Linux case
PLATFORM="linux-cpu"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
PIP_PATH="portable_env/bin/python -m pip"
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
rm start_macos.sh start_windows.bat
fi
# 2. Download and extract Python
echo "Downloading Python for $PLATFORM..."
cd ..
curl -L -o python-build.tar.gz "$PYTHON_URL"
tar -xzf python-build.tar.gz
mv python text-generation-webui/portable_env
# 3. Prepare requirements file based on platform and AVX
cd text-generation-webui
# Select requirements file based on platform
if [[ "$RUNNER_OS" == "macOS" ]]; then
if [[ "$OS_TYPE" == "macos-13" ]]; then
REQ_FILE="requirements/portable/requirements_apple_intel.txt"
else
REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
fi
else
# For Windows and Linux, check AVX support
if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
REQ_FILE="requirements/portable/requirements_cpu_only.txt"
else
REQ_FILE="requirements/portable/requirements_cpu_only_noavx2.txt"
fi
fi
echo "Using requirements file: $REQ_FILE"
# 4. Install packages
echo "Installing Python packages from $REQ_FILE..."
$PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
# 5. Create ZIP file
cd ..
VERSION_CLEAN="${VERSION#v}"
ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip"
echo "Creating archive: $ZIP_NAME"
if [[ "$RUNNER_OS" == "Windows" ]]; then
powershell -Command "Compress-Archive -Path text-generation-webui -DestinationPath $ZIP_NAME"
else
zip -r "$ZIP_NAME" text-generation-webui
fi
- name: Upload files to a GitHub release
id: upload-release
uses: svenstaro/upload-release-action@2.7.0
continue-on-error: true
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: ../textgen-portable-${{ inputs.version }}*.zip
tag: ${{ inputs.version }}
file_glob: true
make_latest: false
overwrite: true


@ -27,6 +27,14 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## How to install
#### Option 1: Portable builds
Compatible with GGUF (llama.cpp) models. Just unzip and run, no installation required. Available for Windows, Linux, and macOS.
Download from: https://github.com/oobabooga/text-generation-webui/releases
#### Option 2: One-click installer
1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip).
2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`.
3) Select your GPU vendor when asked.
@ -352,6 +360,10 @@ Run `python download-model.py --help` to see all the options.
https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/main/Colab-TextGen-GPU.ipynb
## Community
https://www.reddit.com/r/Oobabooga/
## Acknowledgment
In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.


@ -7,10 +7,7 @@ from io import BytesIO
import requests
import tiktoken
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import LogitsProcessor, LogitsProcessorList
from extensions.openai.errors import InvalidRequestError
from extensions.openai.utils import debug_msg
@ -22,54 +19,7 @@ from modules.chat import (
load_instruction_template_memoized
)
from modules.presets import load_preset_memoized
from modules.text_generation import (
decode,
encode,
generate_reply,
get_reply_from_output_ids
)
class LogitsBiasProcessor(LogitsProcessor):
def __init__(self, logit_bias={}):
self.logit_bias = logit_bias
if self.logit_bias:
self.keys = list([int(key) for key in self.logit_bias.keys()])
values = [self.logit_bias[str(key)] for key in self.keys]
self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device)
debug_msg(f"{self})")
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logit_bias:
debug_msg(logits[0, self.keys], " + ", self.values)
logits[0, self.keys] += self.values
debug_msg(" --> ", logits[0, self.keys])
debug_msg(" max/min ", float(torch.max(logits[0])), float(torch.min(logits[0])))
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>"
class LogprobProcessor(LogitsProcessor):
def __init__(self, logprobs=None):
self.logprobs = logprobs
self.token_alternatives = {}
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logprobs is not None: # 0-5
log_e_probabilities = F.log_softmax(logits, dim=1)
top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1)
top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]]
top_probs = [float(x) for x in top_values[0]]
self.token_alternatives = dict(zip(top_tokens, top_probs))
debug_msg(repr(self))
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>"
from modules.text_generation import decode, encode, generate_reply
def convert_logprobs_to_tiktoken(model, logprobs):
@ -107,21 +57,29 @@ def process_parameters(body, is_legacy=False):
elif isinstance(body['stop'], list):
generate_params['custom_stopping_strings'] = body['stop']
logits_processor = []
logit_bias = body.get('logit_bias', None)
if logit_bias: # {str: float, ...}
logits_processor = [LogitsBiasProcessor(logit_bias)]
if shared.args.loader != 'llama.cpp':
from transformers import LogitsProcessorList
logprobs = None # coming to chat eventually
if 'logprobs' in body:
logprobs = body.get('logprobs', 0) # maybe cap at topk? don't clamp 0-5.
generate_params['logprob_proc'] = LogprobProcessor(logprobs)
logits_processor.extend([generate_params['logprob_proc']])
else:
logprobs = None
from modules.transformers_loader import (
LogitsBiasProcessor,
LogprobProcessor
)
if logits_processor: # requires logits_processor support
generate_params['logits_processor'] = LogitsProcessorList(logits_processor)
logits_processor = []
logit_bias = body.get('logit_bias', None)
if logit_bias: # {str: float, ...}
logits_processor = [LogitsBiasProcessor(logit_bias)]
logprobs = None # coming to chat eventually
if 'logprobs' in body:
logprobs = body.get('logprobs', 0) # maybe cap at topk? don't clamp 0-5.
generate_params['logprob_proc'] = LogprobProcessor(logprobs)
logits_processor.extend([generate_params['logprob_proc']])
else:
logprobs = None
if logits_processor: # requires logits_processor support
generate_params['logits_processor'] = LogitsProcessorList(logits_processor)
return generate_params
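
For reference, `LogitsBiasProcessor` (now defined in `modules/transformers_loader.py`, shown further below) applies an OpenAI-style `logit_bias` field, a mapping from token id (as a string) to an additive bias, directly to the next-token logits. A self-contained sketch of that addition, with made-up token ids and a toy vocabulary size:

import torch

logit_bias = {"15496": 5.0, "50256": -100.0}   # hypothetical request field: token id -> bias

keys = [int(k) for k in logit_bias]                        # token ids to adjust
values = torch.tensor([logit_bias[str(k)] for k in keys])  # biases in the same order

logits = torch.zeros(1, 50257)   # logits for one generation step (toy vocabulary size)
logits[0, keys] += values        # the same in-place addition the processor performs
print(logits[0, keys])           # tensor([   5., -100.])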


@ -6,7 +6,6 @@ import traceback
from collections import deque
from threading import Thread
import speech_recognition as sr
import uvicorn
from fastapi import Depends, FastAPI, Header, HTTPException
from fastapi.middleware.cors import CORSMiddleware
@ -16,11 +15,9 @@ from pydub import AudioSegment
from sse_starlette import EventSourceResponse
import extensions.openai.completions as OAIcompletions
import extensions.openai.embeddings as OAIembeddings
import extensions.openai.images as OAIimages
import extensions.openai.logits as OAIlogits
import extensions.openai.models as OAImodels
import extensions.openai.moderations as OAImoderations
from extensions.openai.errors import ServiceUnavailableError
from extensions.openai.tokens import token_count, token_decode, token_encode
from extensions.openai.utils import _start_cloudflared
@ -165,6 +162,8 @@ def handle_billing_usage():
@app.post('/v1/audio/transcriptions', dependencies=check_key)
async def handle_audio_transcription(request: Request):
import speech_recognition as sr
r = sr.Recognizer()
form = await request.form()
@ -211,6 +210,8 @@ async def handle_image_generation(request: Request):
@app.post("/v1/embeddings", response_model=EmbeddingsResponse, dependencies=check_key)
async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
import extensions.openai.embeddings as OAIembeddings
input = request_data.input
if not input:
raise HTTPException(status_code=400, detail="Missing required argument input")
@ -224,6 +225,8 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
@app.post("/v1/moderations", dependencies=check_key)
async def handle_moderations(request: Request):
import extensions.openai.moderations as OAImoderations
body = await request.json()
input = body["input"]
if not input:


@ -2,7 +2,6 @@ from pathlib import Path
import modules.shared as shared
from modules.logging_colors import logger
from modules.models import get_device
def add_lora_to_model(lora_names):
@ -47,9 +46,10 @@ def add_lora_exllamav2(lora_names):
def add_lora_transformers(lora_names):
from peft import PeftModel
from modules.torch_utils import get_device
prior_set = set(shared.lora_names)
added_set = set(lora_names) - prior_set
removed_set = prior_set - set(lora_names)


@ -2,9 +2,6 @@ import traceback
from queue import Queue
from threading import Thread
import torch
import transformers
import modules.shared as shared
@ -12,25 +9,6 @@ class StopNowException(Exception):
pass
class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
def __init__(self):
transformers.StoppingCriteria.__init__(self)
def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
return shared.stop_everything
class Stream(transformers.StoppingCriteria):
def __init__(self, callback_func=None):
self.callback_func = callback_func
def __call__(self, input_ids, scores) -> bool:
if self.callback_func is not None:
self.callback_func(input_ids[0])
return False
class Iteratorize:
"""


@ -2,13 +2,11 @@ import datetime
from pathlib import Path
import pandas as pd
import torch
from datasets import load_dataset
from tqdm import tqdm
from modules import shared
from modules.logging_colors import logger
from modules.models import clear_torch_cache, load_model, unload_model
from modules.models import load_model, unload_model
from modules.models_settings import get_model_metadata, update_model_parameters
from modules.text_generation import encode
@ -39,6 +37,11 @@ def calculate_perplexity(models, input_dataset, stride, _max_length):
https://huggingface.co/docs/transformers/perplexity#calculating-ppl-with-fixedlength-models
'''
import torch
from datasets import load_dataset
from modules.torch_utils import clear_torch_cache
if shared.args.loader == "llama.cpp":
logger.error("Perplexity evaluation is not implemented for the llama.cpp loader.")
raise ValueError


@ -4,10 +4,6 @@ from pathlib import Path
from typing import Any, Dict, Optional, Union
import torch
from torch.nn import CrossEntropyLoss
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Cache,
@ -18,6 +14,15 @@ from exllamav2 import (
ExLlamaV2Cache_TP,
ExLlamaV2Config
)
from torch.nn import CrossEntropyLoss
from transformers import (
GenerationConfig,
GenerationMixin,
PretrainedConfig,
PreTrainedModel
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from modules import shared
from modules.logging_colors import logger
@ -28,7 +33,7 @@ except Exception:
traceback.print_exc()
class Exllamav2HF(PreTrainedModel):
class Exllamav2HF(PreTrainedModel, GenerationMixin):
def __init__(self, config: ExLlamaV2Config):
super().__init__(PretrainedConfig())
self.ex_config = config


@ -6,7 +6,12 @@ from typing import Any, Dict, Optional, Union
import torch
from exllamav3 import Cache, Config, Model
from torch.nn import CrossEntropyLoss
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
from transformers import (
GenerationConfig,
GenerationMixin,
PretrainedConfig,
PreTrainedModel
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from modules import shared
@ -19,7 +24,7 @@ except Exception:
traceback.print_exc()
class Exllamav3HF(PreTrainedModel):
class Exllamav3HF(PreTrainedModel, GenerationMixin):
def __init__(self, model_dir):
super().__init__(PretrainedConfig())
self.generation_config = GenerationConfig()


@ -1,4 +1,5 @@
import json
import os
import pprint
import socket
import subprocess
@ -281,12 +282,21 @@ class LlamaServer:
if shared.args.rope_freq_base > 0:
cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)]
env = os.environ.copy()
if os.name == 'posix':
current_path = env.get('LD_LIBRARY_PATH', '')
if current_path:
env['LD_LIBRARY_PATH'] = f"{current_path}:{os.path.dirname(self.server_path)}"
else:
env['LD_LIBRARY_PATH'] = os.path.dirname(self.server_path)
# Start the server with pipes for output
self.process = subprocess.Popen(
cmd,
stderr=subprocess.PIPE,
text=True,
bufsize=1
bufsize=1,
env=env
)
def filter_stderr(process_stderr):


@ -3,29 +3,7 @@ from collections import OrderedDict
import gradio as gr
from modules import shared
loaders_and_params = OrderedDict({
'Transformers': [
'gpu_memory',
'cpu_memory',
'alpha_value',
'compress_pos_emb',
'compute_dtype',
'quant_type',
'load_in_8bit',
'load_in_4bit',
'torch_compile',
'use_flash_attention_2',
'auto_devices',
'cpu',
'disk',
'use_double_quant',
'use_eager_attention',
'bf16',
'trust_remote_code',
'no_use_fast',
],
'llama.cpp': [
'n_gpu_layers',
'threads',
@ -43,6 +21,25 @@ loaders_and_params = OrderedDict({
'mlock',
'numa',
],
'Transformers': [
'gpu_split',
'cpu_memory',
'alpha_value',
'compress_pos_emb',
'compute_dtype',
'quant_type',
'load_in_8bit',
'load_in_4bit',
'torch_compile',
'use_flash_attention_2',
'cpu',
'disk',
'use_double_quant',
'use_eager_attention',
'bf16',
'trust_remote_code',
'no_use_fast',
],
'ExLlamav3_HF': [
'max_seq_len',
'gpu_split',
@ -346,10 +343,6 @@ def blacklist_samplers(loader, dynamic_temperature):
return output
def get_gpu_memory_keys():
return [k for k in shared.gradio if k.startswith('gpu_memory')]
@functools.cache
def get_all_params():
all_params = set()
@ -357,11 +350,6 @@ def get_all_params():
for el in loaders_and_params[k]:
all_params.add(el)
if 'gpu_memory' in all_params:
all_params.remove('gpu_memory')
for k in get_gpu_memory_keys():
all_params.add(k)
return sorted(all_params)
@ -371,8 +359,4 @@ def make_loader_params_visible(loader):
if loader in loaders_and_params:
params = loaders_and_params[loader]
if 'gpu_memory' in params:
params.remove('gpu_memory')
params += get_gpu_memory_keys()
return [gr.update(visible=True) if k in params else gr.update(visible=False) for k in all_params]


@ -2,11 +2,10 @@ import time
import traceback
import numpy as np
import torch
from modules import models, sampler_hijack, shared
from modules import models, shared
from modules.logging_colors import logger
from modules.models import get_device, load_model
from modules.models import load_model
from modules.text_generation import generate_reply
global_scores = None
@ -38,18 +37,16 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
logger.error("No model is loaded! Select one in the Model tab.")
return 'Error: No model is loaded! Select one in the Model tab.', previous
is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
is_llamacpp = shared.model.__class__.__name__ == 'LlamaServer'
if is_llamacpp:
# llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer':
logprobs = shared.model.get_logits(prompt, state, n_probs=top_logits, use_samplers=use_samplers)
if return_dict:
output = {}
for entry in logprobs:
token = repr(entry['token'])
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
output[token] = prob
return output
else:
output = ''
@ -57,9 +54,17 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
token = repr(entry['token'])
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
output += f"{prob:.5f} - {token}\n"
return output, previous
# All other model types
else:
import torch
from modules import sampler_hijack
from modules.torch_utils import get_device
is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
if not use_samplers:
state = {'stream': True}


@ -1,61 +1,11 @@
import gc
import os
import pprint
import re
import sys
import time
from pathlib import Path
import torch
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.utils import (
is_ccl_available,
is_npu_available,
is_xpu_available
)
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
BitsAndBytesConfig,
is_torch_npu_available,
is_torch_xpu_available
)
import modules.shared as shared
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata
transformers.logging.set_verbosity_error()
local_rank = None
if shared.args.deepspeed:
import deepspeed
from transformers.integrations.deepspeed import (
HfDeepSpeedConfig,
is_deepspeed_zero3_enabled
)
from modules.deepspeed_parameters import generate_ds_config
# Distributed setup
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
if is_xpu_available() and is_ccl_available():
torch.xpu.set_device(local_rank)
deepspeed.init_distributed(backend="ccl")
elif is_npu_available():
torch.npu.set_device(local_rank)
deepspeed.init_distributed(dist_backend="hccl")
else:
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
last_generation_time = time.time()
@ -66,8 +16,8 @@ def load_model(model_name, loader=None):
shared.is_seq2seq = False
shared.model_name = model_name
load_func_map = {
'Transformers': huggingface_loader,
'llama.cpp': llama_cpp_server_loader,
'Transformers': transformers_loader,
'ExLlamav3_HF': ExLlamav3_HF_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ExLlamav2': ExLlamav2_loader,
@ -85,8 +35,11 @@ def load_model(model_name, loader=None):
logger.error('The path to the model does not exist. Exiting.')
raise ValueError
if loader != 'llama.cpp' and 'sampler_hijack' not in sys.modules:
from modules import sampler_hijack
sampler_hijack.hijack_samplers()
shared.args.loader = loader
clear_torch_cache()
output = load_func_map[loader](model_name)
if type(output) is tuple:
model, tokenizer = output
@ -95,6 +48,7 @@ def load_model(model_name, loader=None):
if model is None:
return None, None
else:
from modules.transformers_loader import load_tokenizer
tokenizer = load_tokenizer(model_name)
shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
@ -110,163 +64,6 @@ def load_model(model_name, loader=None):
return model, tokenizer
def load_tokenizer(model_name, tokenizer_dir=None):
if tokenizer_dir:
path_to_model = Path(tokenizer_dir)
else:
path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
tokenizer = None
if path_to_model.exists():
if shared.args.no_use_fast:
logger.info('Loading the tokenizer with use_fast=False.')
tokenizer = AutoTokenizer.from_pretrained(
path_to_model,
trust_remote_code=shared.args.trust_remote_code,
use_fast=not shared.args.no_use_fast
)
return tokenizer
def huggingface_loader(model_name):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
params = {
'low_cpu_mem_usage': True,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
}
if shared.args.trust_remote_code:
params['trust_remote_code'] = True
if shared.args.use_flash_attention_2:
params['use_flash_attention_2'] = True
if shared.args.force_safetensors:
params['force_safetensors'] = True
if shared.args.use_eager_attention:
params['attn_implementation'] = 'eager'
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
if 'chatglm' in model_name.lower():
LoaderClass = AutoModel
else:
if config.to_dict().get('is_encoder_decoder', False):
LoaderClass = AutoModelForSeq2SeqLM
shared.is_seq2seq = True
else:
LoaderClass = AutoModelForCausalLM
# Determine if we should use default loading
should_use_default_loading = not any([
shared.args.cpu,
shared.args.load_in_8bit,
shared.args.load_in_4bit,
shared.args.auto_devices,
shared.args.disk,
shared.args.deepspeed,
shared.args.gpu_memory is not None,
shared.args.cpu_memory is not None,
shared.args.compress_pos_emb > 1,
shared.args.alpha_value > 1,
])
# Load the model without any special settings
if should_use_default_loading:
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit):
device = get_device()
if device:
model = model.to(device)
# DeepSpeed ZeRO-3
elif shared.args.deepspeed:
model = LoaderClass.from_pretrained(
path_to_model,
torch_dtype=params['torch_dtype'],
trust_remote_code=params.get('trust_remote_code')
)
model = deepspeed.initialize(
model=model,
config_params=ds_config,
model_parameters=None,
optimizer=None,
lr_scheduler=None
)[0]
model.module.eval() # Inference
logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
# Load with quantization and/or offloading
else:
if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
shared.args.cpu = True
if shared.args.cpu:
params['torch_dtype'] = torch.float32
else:
params['device_map'] = 'auto'
if x := get_max_memory_dict():
params['max_memory'] = x
if shared.args.load_in_4bit:
# See https://github.com/huggingface/transformers/pull/23479/files
# and https://huggingface.co/blog/4bit-transformers-bitsandbytes
quantization_config_params = {
'load_in_4bit': True,
'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
'bnb_4bit_quant_type': shared.args.quant_type,
'bnb_4bit_use_double_quant': shared.args.use_double_quant,
'llm_int8_enable_fp32_cpu_offload': True
}
params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
elif shared.args.load_in_8bit:
if shared.args.auto_devices or shared.args.gpu_memory:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
else:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
if params.get('max_memory') is not None:
with init_empty_weights():
model = LoaderClass.from_config(config, trust_remote_code=params.get('trust_remote_code'))
model.tie_weights()
params['device_map'] = infer_auto_device_map(
model,
dtype=torch.int8,
max_memory=params.get('max_memory'),
no_split_module_classes=model._no_split_modules
)
if shared.args.disk:
params['offload_folder'] = shared.args.disk_cache_dir
if shared.args.compress_pos_emb > 1:
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
elif shared.args.alpha_value > 1:
params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if shared.args.torch_compile:
model = torch.compile(model)
return model
def llama_cpp_server_loader(model_name):
from modules.llama_cpp_server import LlamaServer
@ -284,6 +81,11 @@ def llama_cpp_server_loader(model_name):
logger.error(f"Error loading the model with llama.cpp: {str(e)}")
def transformers_loader(model_name):
from modules.transformers_loader import load_model_HF
return load_model_HF(model_name)
def ExLlamav3_HF_loader(model_name):
from modules.exllamav3_hf import Exllamav3HF
@ -328,71 +130,18 @@ def TensorRT_LLM_loader(model_name):
return model
def get_max_memory_dict():
max_memory = {}
max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
if shared.args.gpu_memory:
memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
for i in range(len(memory_map)):
max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
# If --auto-devices is provided standalone, try to get a reasonable value
# for the maximum memory of device :0
elif shared.args.auto_devices:
if is_xpu_available():
total_mem = (torch.xpu.get_device_properties(0).total_memory / (1024 * 1024))
else:
total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
suggestion = round((total_mem - 1000) / 1000) * 1000
if total_mem - suggestion < 800:
suggestion -= 1000
suggestion = int(round(suggestion / 1000))
logger.warning(f"Auto-assigning --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors. You can manually set other values.")
max_memory[0] = f'{suggestion}GiB'
max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
return max_memory if len(max_memory) > 0 else None
def get_device():
if torch.cuda.is_available():
return torch.device('cuda')
elif shared.args.deepspeed:
import deepspeed
return deepspeed.get_accelerator().current_device_name()
elif torch.backends.mps.is_available():
return torch.device('mps')
elif is_torch_xpu_available():
return torch.device('xpu:0')
elif is_torch_npu_available():
return torch.device('npu:0')
else:
return None
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
if torch.cuda.is_available():
torch.cuda.empty_cache()
elif is_xpu_available():
torch.xpu.empty_cache()
elif is_npu_available():
torch.npu.empty_cache()
elif torch.backends.mps.is_available():
if hasattr(torch.backends.mps, 'empty_cache'):
torch.backends.mps.empty_cache()
def unload_model(keep_model_name=False):
if shared.model is None:
return
is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
shared.model = shared.tokenizer = None
shared.lora_names = []
shared.model_dirty_from_training = False
clear_torch_cache()
if not is_llamacpp:
from modules.torch_utils import clear_torch_cache
clear_torch_cache()
if not keep_model_name:
shared.model_name = 'None'


@ -188,41 +188,20 @@ def update_model_parameters(state, initial=False):
UI: update the command-line arguments based on the interface values
'''
elements = ui.list_model_elements() # the names of the parameters
gpu_memories = []
for i, element in enumerate(elements):
if element not in state:
continue
value = state[element]
if element.startswith('gpu_memory'):
gpu_memories.append(value)
continue
if initial and element in shared.provided_arguments:
continue
if element in ['cpu_memory'] and value == 0:
if element == 'cpu_memory' and value == 0:
value = vars(shared.args_defaults)[element]
# Making some simple conversions
if element == 'cpu_memory' and value is not None:
value = f"{value}MiB"
setattr(shared.args, element, value)
found_positive = False
for i in gpu_memories:
if i > 0:
found_positive = True
break
if not (initial and vars(shared.args)['gpu_memory'] != vars(shared.args_defaults)['gpu_memory']):
if found_positive:
shared.args.gpu_memory = [f"{i}MiB" for i in gpu_memories]
else:
shared.args.gpu_memory = None
def apply_model_settings_to_state(model, state):
'''


@ -13,7 +13,10 @@ from transformers.generation.logits_process import (
from modules import shared
from modules.logging_colors import logger
from modules.models import get_device
from modules.torch_utils import get_device
original_init = transformers.GenerationConfig.__init__
original_get_logits_processor = transformers.GenerationMixin._get_logits_processor
global_scores = None
@ -484,7 +487,7 @@ def get_logits_processor_patch(self, **kwargs):
generation_config.temperature = float(generation_config.temperature) # Must be float
# Get the original warpers
warpers = self._get_logits_processor_old(**kwargs)
warpers = original_get_logits_processor(self, **kwargs)
for i in range(len(warpers) - 1, -1, -1):
# Replace temperature with our modified class.
@ -674,7 +677,7 @@ def get_logits_processor_patch(self, **kwargs):
def generation_config_init_patch(self, **kwargs):
self.__init___old(**kwargs)
original_init(self, **kwargs)
self.min_p = kwargs.pop("min_p", 0.0)
self.dynamic_temperature = kwargs.pop("dynamic_temperature", False)
self.dynatemp_low = kwargs.pop("dynatemp_low", 1)
@ -702,8 +705,5 @@ def generation_config_init_patch(self, **kwargs):
def hijack_samplers():
transformers.GenerationMixin._get_logits_processor_old = transformers.GenerationMixin._get_logits_processor
transformers.GenerationMixin._get_logits_processor = get_logits_processor_patch
transformers.GenerationConfig.__init___old = transformers.GenerationConfig.__init__
transformers.GenerationConfig.__init__ = generation_config_init_patch
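
The hunk above replaces the old pattern of stashing `_old` attributes on the patched transformers classes with module-level references to the original callables (`original_init`, `original_get_logits_processor`) that the patches delegate to. A minimal sketch of that monkey-patching pattern, using illustrative names unrelated to transformers:

class Greeter:
    def greet(self, name):
        return f"Hello, {name}"

original_greet = Greeter.greet          # keep a module-level reference to the original

def greet_patch(self, name):
    # delegate to the saved reference instead of a Greeter.greet_old attribute
    return original_greet(self, name) + "!"

def hijack_greeter():
    Greeter.greet = greet_patch

hijack_greeter()
print(Greeter().greet("world"))         # Hello, world!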


@ -79,6 +79,7 @@ group.add_argument('--model', type=str, help='Name of the model to load by defau
group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.')
group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.')
group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
@ -91,9 +92,7 @@ group.add_argument('--loader', type=str, help='Choose the model loader manually,
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
group.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
group.add_argument('--gpu-memory', type=str, nargs='+', help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.')
group.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.')
group.add_argument('--cpu-memory', type=float, default=0, help='Maximum CPU memory in GiB. Use this for CPU offloading.')
group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".')
group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')


@ -7,33 +7,18 @@ import time
import traceback
import numpy as np
import torch
import transformers
from transformers import (
LogitsProcessorList,
is_torch_npu_available,
is_torch_xpu_available
)
import modules.shared as shared
from modules import models, sampler_hijack
from modules.callbacks import (
Iteratorize,
Stream,
_StopEverythingStoppingCriteria
)
from modules import models
from modules.callbacks import Iteratorize
from modules.extensions import apply_extensions
from modules.grammar.grammar_utils import initialize_grammar
from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor
from modules.html_generator import generate_basic_html
from modules.logging_colors import logger
from modules.models import clear_torch_cache, get_device, load_model
sampler_hijack.hijack_samplers()
def generate_reply(*args, **kwargs):
if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
from modules.models import load_model
shared.model, shared.tokenizer = load_model(shared.model_name)
shared.generation_lock.acquire()
@ -46,7 +31,6 @@ def generate_reply(*args, **kwargs):
def _generate_reply(question, state, stopping_strings=None, is_chat=False, escape_html=False, for_ui=False):
# Find the appropriate generation function
generate_func = apply_extensions('custom_generate_reply')
if generate_func is None:
@ -80,7 +64,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
all_stop_strings += st
shared.stop_everything = False
seed = set_manual_seed(state['seed'])
last_update = -1
reply = ''
is_stream = state['stream']
@ -93,7 +76,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
min_update_interval = 1 / state['max_updates_second']
# Generate
for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat):
reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
if escape_html:
reply = html.escape(reply)
@ -132,44 +115,55 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded')
if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel']:
if shared.model.__class__.__name__ == 'LlamaServer':
input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token)
else:
# llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer':
input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token)
input_ids = np.array(input_ids).reshape(1, len(input_ids))
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
return input_ids
# All other model types
else:
import torch
from modules.torch_utils import get_device
if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel']:
input_ids = shared.tokenizer.encode(str(prompt))
if shared.model.__class__.__name__ != 'Exllamav2Model':
input_ids = np.array(input_ids).reshape(1, len(input_ids))
else:
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
if shared.model.__class__.__name__ not in ['Exllamav2Model']:
input_ids = np.array(input_ids).reshape(1, len(input_ids))
else:
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None:
if add_bos_token:
# Add BOS token if missing
if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0:
bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]])
input_ids = torch.cat((bos_tensor, input_ids), 1)
if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None:
if add_bos_token:
if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0:
# Add a missing bos token (it may not have been added due to faulty model metadata)
bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]])
input_ids = torch.cat((bos_tensor, input_ids), 1)
# Prevent double BOS tokens from jinja templates
while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id:
input_ids = input_ids[:, 1:]
else:
# Remove BOS tokens when not wanted
while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
input_ids = input_ids[:, 1:]
# Prevent double bos token due to jinja templates with <s> somewhere
while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id:
input_ids = input_ids[:, 1:]
else:
# Remove any bos token that may have been added
while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
input_ids = input_ids[:, 1:]
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
# Handling truncation
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
return input_ids
else:
device = get_device()
if device:
return input_ids.to(device)
if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
return input_ids
else:
device = get_device()
if device:
return input_ids.to(device)
return input_ids
return input_ids
def decode(output_ids, skip_special_tokens=True):
@ -225,13 +219,17 @@ def set_manual_seed(seed):
if seed == -1:
seed = random.randint(1, 2**31)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
elif is_torch_xpu_available():
torch.xpu.manual_seed_all(seed)
elif is_torch_npu_available():
torch.npu.manual_seed_all(seed)
if shared.args.loader != 'llama.cpp':
import torch
from transformers import is_torch_npu_available, is_torch_xpu_available
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
elif is_torch_xpu_available():
torch.xpu.manual_seed_all(seed)
elif is_torch_npu_available():
torch.npu.manual_seed_all(seed)
return seed
@ -285,10 +283,26 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
return reply
def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False):
def generate_reply_HF(question, original_question, state, stopping_strings=None, is_chat=False):
import torch
import transformers
from transformers import LogitsProcessorList
from modules.grammar.grammar_utils import initialize_grammar
from modules.grammar.logits_process import (
GrammarConstrainedLogitsProcessor
)
from modules.torch_utils import clear_torch_cache, get_device
from modules.transformers_loader import (
Stream,
_StopEverythingStoppingCriteria
)
if shared.args.loader == 'Transformers':
clear_torch_cache()
seed = set_manual_seed(state['seed'])
generate_params = {}
for k in [
'temperature',
@ -458,12 +472,12 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
return
def generate_reply_custom(question, original_question, seed, state, stopping_strings=None, is_chat=False):
def generate_reply_custom(question, original_question, state, stopping_strings=None, is_chat=False):
"""
For models that do not use the transformers library for sampling
"""
seed = set_manual_seed(state['seed'])
seed = set_manual_seed(state['seed'])
t0 = time.time()
reply = ''
try:

modules/torch_utils.py (new file, +37 lines)

@ -0,0 +1,37 @@
import gc
import torch
from accelerate.utils import is_npu_available, is_xpu_available
from transformers import is_torch_npu_available, is_torch_xpu_available
from modules import shared
def get_device():
if torch.cuda.is_available():
return torch.device('cuda')
elif shared.args.deepspeed:
import deepspeed
return deepspeed.get_accelerator().current_device_name()
elif torch.backends.mps.is_available():
return torch.device('mps')
elif is_torch_xpu_available():
return torch.device('xpu:0')
elif is_torch_npu_available():
return torch.device('npu:0')
else:
return None
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
if torch.cuda.is_available():
torch.cuda.empty_cache()
elif is_xpu_available():
torch.xpu.empty_cache()
elif is_npu_available():
torch.npu.empty_cache()
elif torch.backends.mps.is_available():
if hasattr(torch.backends.mps, 'empty_cache'):
torch.backends.mps.empty_cache()
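
A short usage sketch of the new `modules/torch_utils.py` helpers above (it assumes a running text-generation-webui environment where `modules.shared` has been initialized, since the module imports it):

import torch

from modules.torch_utils import clear_torch_cache, get_device

device = get_device()    # torch.device for CUDA/MPS/XPU/NPU, or None on CPU-only machines
x = torch.ones(2, 2)
if device is not None:
    x = x.to(device)

clear_torch_cache()      # gc.collect() followed by the backend-specific empty_cache()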


@ -15,13 +15,6 @@ from datetime import datetime
from pathlib import Path
import gradio as gr
import torch
import transformers
from datasets import Dataset, load_dataset
from transformers import is_torch_xpu_available
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
from modules import shared, ui, utils
from modules.evaluate import (
@ -33,7 +26,6 @@ from modules.logging_colors import logger
from modules.models import reload_model
from modules.utils import natural_keys
MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}
PARAMETERS = ["lora_name", "always_override", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"]
WANT_INTERRUPT = False
@ -284,6 +276,9 @@ def calc_trainable_parameters(model):
def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str):
import torch
import transformers
from datasets import Dataset, load_dataset
from peft import (
LoraConfig,
get_peft_model,
@ -293,6 +288,12 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
from peft.utils.other import \
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as \
model_to_lora_modules
from transformers import is_torch_xpu_available
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}
global WANT_INTERRUPT
WANT_INTERRUPT = False


@ -0,0 +1,279 @@
import os
import pprint
from pathlib import Path
import torch
import torch.nn.functional as F
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.utils import (
is_ccl_available,
is_npu_available,
is_xpu_available
)
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
BitsAndBytesConfig,
LogitsProcessor
)
import modules.shared as shared
from modules.logging_colors import logger
from modules.text_generation import get_reply_from_output_ids
from modules.torch_utils import get_device
transformers.logging.set_verbosity_error()
local_rank = None
if shared.args.deepspeed:
import deepspeed
from transformers.integrations.deepspeed import (
HfDeepSpeedConfig,
is_deepspeed_zero3_enabled
)
from modules.deepspeed_parameters import generate_ds_config
# Distributed setup
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
if is_xpu_available() and is_ccl_available():
torch.xpu.set_device(local_rank)
deepspeed.init_distributed(backend="ccl")
elif is_npu_available():
torch.npu.set_device(local_rank)
deepspeed.init_distributed(dist_backend="hccl")
else:
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
def __init__(self):
transformers.StoppingCriteria.__init__(self)
def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
return shared.stop_everything
class Stream(transformers.StoppingCriteria):
def __init__(self, callback_func=None):
self.callback_func = callback_func
def __call__(self, input_ids, scores) -> bool:
if self.callback_func is not None:
self.callback_func(input_ids[0])
return False
class LogitsBiasProcessor(LogitsProcessor):
def __init__(self, logit_bias={}):
self.logit_bias = logit_bias
if self.logit_bias:
self.keys = list([int(key) for key in self.logit_bias.keys()])
values = [self.logit_bias[str(key)] for key in self.keys]
self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device)
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logit_bias:
logits[0, self.keys] += self.values
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>"
class LogprobProcessor(LogitsProcessor):
def __init__(self, logprobs=None):
self.logprobs = logprobs
self.token_alternatives = {}
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logprobs is not None: # 0-5
log_e_probabilities = F.log_softmax(logits, dim=1)
top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1)
top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]]
top_probs = [float(x) for x in top_values[0]]
self.token_alternatives = dict(zip(top_tokens, top_probs))
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>"
def load_tokenizer(model_name, tokenizer_dir=None):
if tokenizer_dir:
path_to_model = Path(tokenizer_dir)
else:
path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
tokenizer = None
if path_to_model.exists():
if shared.args.no_use_fast:
logger.info('Loading the tokenizer with use_fast=False.')
tokenizer = AutoTokenizer.from_pretrained(
path_to_model,
trust_remote_code=shared.args.trust_remote_code,
use_fast=not shared.args.no_use_fast
)
return tokenizer
def load_model_HF(model_name):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
params = {
'low_cpu_mem_usage': True,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
}
if shared.args.trust_remote_code:
params['trust_remote_code'] = True
if shared.args.use_flash_attention_2:
params['use_flash_attention_2'] = True
if shared.args.force_safetensors:
params['force_safetensors'] = True
if shared.args.use_eager_attention:
params['attn_implementation'] = 'eager'
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
if 'chatglm' in model_name.lower():
LoaderClass = AutoModel
else:
if config.to_dict().get('is_encoder_decoder', False):
LoaderClass = AutoModelForSeq2SeqLM
shared.is_seq2seq = True
else:
LoaderClass = AutoModelForCausalLM
# Determine if we should use default loading
should_use_default_loading = not any([
shared.args.cpu,
shared.args.load_in_8bit,
shared.args.load_in_4bit,
shared.args.disk,
shared.args.deepspeed,
shared.args.cpu_memory is not None,
shared.args.compress_pos_emb > 1,
shared.args.alpha_value > 1,
])
# Load the model without any special settings
if should_use_default_loading:
params['device_map'] = 'auto'
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit):
device = get_device()
if device:
model = model.to(device)
# DeepSpeed ZeRO-3
elif shared.args.deepspeed:
model = LoaderClass.from_pretrained(
path_to_model,
torch_dtype=params['torch_dtype'],
trust_remote_code=params.get('trust_remote_code')
)
model = deepspeed.initialize(
model=model,
config_params=ds_config,
model_parameters=None,
optimizer=None,
lr_scheduler=None
)[0]
model.module.eval() # Inference
logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
# Load with quantization and/or offloading
else:
if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
shared.args.cpu = True
if shared.args.cpu:
params['torch_dtype'] = torch.float32
else:
params['device_map'] = 'auto'
if x := get_max_memory_dict():
params['max_memory'] = x
if shared.args.load_in_4bit:
# See https://github.com/huggingface/transformers/pull/23479/files
# and https://huggingface.co/blog/4bit-transformers-bitsandbytes
quantization_config_params = {
'load_in_4bit': True,
'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
'bnb_4bit_quant_type': shared.args.quant_type,
'bnb_4bit_use_double_quant': shared.args.use_double_quant,
'llm_int8_enable_fp32_cpu_offload': True
}
params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
elif shared.args.load_in_8bit:
if shared.args.gpu_split:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
else:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
if params.get('max_memory') is not None:
with init_empty_weights():
model = LoaderClass.from_config(config, trust_remote_code=params.get('trust_remote_code'))
model.tie_weights()
params['device_map'] = infer_auto_device_map(
model,
dtype=torch.int8,
max_memory=params.get('max_memory'),
no_split_module_classes=model._no_split_modules
)
if shared.args.disk:
params['offload_folder'] = shared.args.disk_cache_dir
if shared.args.compress_pos_emb > 1:
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
elif shared.args.alpha_value > 1:
params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if shared.args.torch_compile:
model = torch.compile(model)
return model
def get_max_memory_dict():
max_memory = {}
if shared.args.cpu_memory > 0:
max_memory['cpu'] = f'{shared.args.cpu_memory}GiB'
if shared.args.gpu_split:
for i, memory in enumerate(shared.args.gpu_split.split(',')):
max_memory[i] = f'{memory}GiB'
return max_memory if len(max_memory) > 0 else None
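
A minimal standalone sketch of what the 4-bit branch of load_model_HF() above assembles, with a placeholder model id and placeholder memory limits; it assumes a CUDA GPU and bitsandbytes are available:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,
)
model = AutoModelForCausalLM.from_pretrained(
    'facebook/opt-125m',                      # placeholder model id
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map='auto',
    max_memory={0: '20GiB', 'cpu': '64GiB'},  # placeholder limits, cf. get_max_memory_dict()
    quantization_config=quantization_config,
)
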

View file

@ -2,9 +2,7 @@ import copy
from pathlib import Path
import gradio as gr
import torch
import yaml
from transformers import is_torch_xpu_available
import extensions
from modules import shared
@ -128,7 +126,6 @@ def list_model_elements():
'torch_compile',
'flash_attn',
'use_flash_attention_2',
'auto_devices',
'cpu',
'disk',
'row_split',
@ -150,13 +147,6 @@ def list_model_elements():
'no_use_fast',
]
if is_torch_xpu_available():
for i in range(torch.xpu.device_count()):
elements.append(f'gpu_memory_{i}')
else:
for i in range(torch.cuda.device_count()):
elements.append(f'gpu_memory_{i}')
return elements

View file

@ -1,14 +1,9 @@
import importlib
import math
import re
import traceback
from functools import partial
from pathlib import Path
import gradio as gr
import psutil
import torch
from transformers import is_torch_npu_available, is_torch_xpu_available
from modules import loaders, shared, ui, utils
from modules.logging_colors import logger
@ -27,35 +22,6 @@ from modules.utils import gradio
def create_ui():
mu = shared.args.multi_user
# Finding the default values for the GPU and CPU memories
total_mem = []
if is_torch_xpu_available():
for i in range(torch.xpu.device_count()):
total_mem.append(math.floor(torch.xpu.get_device_properties(i).total_memory / (1024 * 1024)))
elif is_torch_npu_available():
for i in range(torch.npu.device_count()):
total_mem.append(math.floor(torch.npu.get_device_properties(i).total_memory / (1024 * 1024)))
else:
for i in range(torch.cuda.device_count()):
total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))
default_gpu_mem = []
if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
for i in shared.args.gpu_memory:
if 'mib' in i.lower():
default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
else:
default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)
while len(default_gpu_mem) < len(total_mem):
default_gpu_mem.append(0)
total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024))
if shared.args.cpu_memory is not None:
default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory)
else:
default_cpu_mem = 0
with gr.Tab("Model", elem_id="model-tab"):
with gr.Row():
with gr.Column():
@ -80,10 +46,6 @@ def create_ui():
with gr.Blocks():
with gr.Row():
with gr.Column():
for i in range(len(total_mem)):
shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i])
shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem)
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
@ -94,6 +56,7 @@ def create_ui():
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
@ -107,7 +70,6 @@ def create_ui():
shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
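
A quick worked example of the relationship stated in the rope_freq_base info text above, rope_freq_base = 10000 * alpha_value ** (64 / 63):

for alpha_value in (1.75, 2.5):
    rope_freq_base = 10000 * alpha_value ** (64 / 63)
    print(alpha_value, round(rope_freq_base))
# The recommended alpha values of 1.75 and 2.5 map to bases of roughly 17,700 and 25,400.
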

View file

@ -15,7 +15,6 @@ import sys
# os.environ["HSA_OVERRIDE_GFX_VERSION"] = '10.3.0'
# os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030'
# Define the required versions
TORCH_VERSION = "2.6.0"
TORCHVISION_VERSION = "0.21.0"
@ -62,6 +61,19 @@ def is_x86_64():
return platform.machine() == "x86_64"
def is_installed():
site_packages_path = None
for sitedir in site.getsitepackages():
if "site-packages" in sitedir and conda_env_path in sitedir:
site_packages_path = sitedir
break
if site_packages_path:
return os.path.isfile(os.path.join(site_packages_path, 'torch', '__init__.py'))
else:
return os.path.isdir(conda_env_path)
def cpu_has_avx2():
try:
import cpuinfo
@ -104,44 +116,13 @@ def torch_version():
return torver
def update_pytorch_and_python():
print_big_message("Checking for PyTorch updates.")
# Update the Python version. Left here for future reference in case this becomes necessary.
# print_big_message("Checking for PyTorch and Python updates.")
# current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
# if current_python_version != PYTHON_VERSION:
# run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True)
torver = torch_version()
base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}"
if "+cu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
elif "+rocm" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
elif "+cpu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
elif "+cxx11" in torver:
intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
else:
install_cmd = base_cmd
run_cmd(install_cmd, assert_success=True, environment=True)
def get_current_commit():
result = run_cmd("git rev-parse HEAD", capture_output=True, environment=True)
return result.stdout.decode('utf-8').strip()
def is_installed():
site_packages_path = None
for sitedir in site.getsitepackages():
if "site-packages" in sitedir and conda_env_path in sitedir:
site_packages_path = sitedir
break
if site_packages_path:
return os.path.isfile(os.path.join(site_packages_path, 'torch', '__init__.py'))
else:
return os.path.isdir(conda_env_path)
def get_extensions_names():
return [foldername for foldername in os.listdir('extensions') if os.path.isfile(os.path.join('extensions', foldername, 'requirements.txt'))]
def check_env():
@ -157,35 +138,11 @@ def check_env():
sys.exit(1)
def get_current_commit():
result = run_cmd("git rev-parse HEAD", capture_output=True, environment=True)
return result.stdout.decode('utf-8').strip()
def clear_cache():
run_cmd("conda clean -a -y", environment=True)
run_cmd("python -m pip cache purge", environment=True)
def print_big_message(message):
message = message.strip()
lines = message.split('\n')
print("\n\n*******************************************************************")
for line in lines:
print("*", line)
print("*******************************************************************\n\n")
def calculate_file_hash(file_path):
p = os.path.join(script_dir, file_path)
if os.path.isfile(p):
with open(p, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
else:
return ''
def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, env=None):
# Use the conda environment
if environment:
@ -210,6 +167,25 @@ def run_cmd(cmd, assert_success=False, environment=False, capture_output=False,
return result
def print_big_message(message):
message = message.strip()
lines = message.split('\n')
print("\n\n*******************************************************************")
for line in lines:
print("*", line)
print("*******************************************************************\n\n")
def calculate_file_hash(file_path):
p = os.path.join(script_dir, file_path)
if os.path.isfile(p):
with open(p, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
else:
return ''
def generate_alphabetic_sequence(index):
result = ''
while index >= 0:
@ -238,6 +214,51 @@ def get_user_choice(question, options_dict):
return choice
def update_pytorch_and_python():
print_big_message("Checking for PyTorch updates.")
# Update the Python version. Left here for future reference in case this becomes necessary.
# print_big_message("Checking for PyTorch and Python updates.")
# current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
# if current_python_version != PYTHON_VERSION:
# run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True)
torver = torch_version()
base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}"
if "+cu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
elif "+rocm" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
elif "+cpu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
elif "+cxx11" in torver:
intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
else:
install_cmd = base_cmd
run_cmd(install_cmd, assert_success=True, environment=True)
def clean_outdated_pytorch_cuda_dependencies():
patterns = ["cu121", "cu122", "torch2.4"]
result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True)
matching_packages = []
for line in result.stdout.decode('utf-8').splitlines():
if "==" in line:
pkg_name, version = line.split('==', 1)
if any(pattern in version for pattern in patterns):
matching_packages.append(pkg_name)
if matching_packages:
print(f"\nUninstalling: {', '.join(matching_packages)}\n")
run_cmd(f"python -m pip uninstall -y {' '.join(matching_packages)}", assert_success=True, environment=True)
return matching_packages
def install_webui():
if os.path.isfile(state_file):
os.remove(state_file)
@ -323,37 +344,6 @@ def install_webui():
update_requirements(initial_installation=True, pull=False)
def get_extensions_names():
return [foldername for foldername in os.listdir('extensions') if os.path.isfile(os.path.join('extensions', foldername, 'requirements.txt'))]
def install_extensions_requirements():
print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.")
extensions = get_extensions_names()
for i, extension in enumerate(extensions):
print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n")
extension_req_path = os.path.join("extensions", extension, "requirements.txt")
run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True)
def clean_outdated_pytorch_cuda_dependencies():
patterns = ["cu121", "cu122", "torch2.4"]
result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True)
matching_packages = []
for line in result.stdout.decode('utf-8').splitlines():
if "==" in line:
pkg_name, version = line.split('==', 1)
if any(pattern in version for pattern in patterns):
matching_packages.append(pkg_name)
if matching_packages:
print(f"\nUninstalling: {', '.join(matching_packages)}\n")
run_cmd(f"python -m pip uninstall -y {' '.join(matching_packages)}", assert_success=True, environment=True)
return matching_packages
def update_requirements(initial_installation=False, pull=True):
# Create .git directory if missing
if not os.path.exists(os.path.join(script_dir, ".git")):
@ -366,14 +356,18 @@ def update_requirements(initial_installation=False, pull=True):
)
torver = torch_version()
requirements_base = os.path.join("requirements", "full")
if "+rocm" in torver:
requirements_file = "requirements_amd" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt"
file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt"
elif "+cpu" in torver or "+cxx11" in torver:
requirements_file = "requirements_cpu_only" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt"
file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt"
elif is_macos():
requirements_file = "requirements_apple_" + ("intel" if is_x86_64() else "silicon") + ".txt"
file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt"
else:
requirements_file = "requirements" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt"
file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt"
requirements_file = os.path.join(requirements_base, file_name)
# Load state from JSON file
current_commit = get_current_commit()
@ -475,6 +469,15 @@ def update_requirements(initial_installation=False, pull=True):
clear_cache()
def install_extensions_requirements():
print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.")
extensions = get_extensions_names()
for i, extension in enumerate(extensions):
print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n")
extension_req_path = os.path.join("extensions", extension, "requirements.txt")
run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True)
def launch_webui():
run_cmd(f"python server.py {flags}", environment=True)

View file

@ -7,7 +7,6 @@ fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
pandas
peft==0.15.*
@ -26,14 +25,13 @@ tqdm
wandb
# API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
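
The conditions after each wheel URL above are standard PEP 508 environment markers, which pip evaluates against the installing machine. A small sketch using the packaging library (assumed installed; pip bundles its own copy), with the marker string copied from the Linux CUDA wheel:

from packaging.markers import Marker

marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')
print(marker.evaluate())  # True only on 64-bit Linux running Python 3.11
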

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
pandas
peft==0.15.*
@ -25,12 +24,11 @@ tqdm
wandb
# API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
pandas
peft==0.15.*
@ -25,12 +24,11 @@ tqdm
wandb
# API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
pandas
peft==0.15.*
@ -25,14 +24,12 @@ tqdm
wandb
# API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
pandas
peft==0.15.*
@ -25,13 +24,13 @@ tqdm
wandb
# API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
pandas
peft==0.15.*
@ -25,11 +24,10 @@ tqdm
wandb
# API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
pandas
peft==0.15.*
@ -25,11 +24,10 @@ tqdm
wandb
# API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -7,7 +7,6 @@ fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
pandas
peft==0.15.*
@ -26,14 +25,13 @@ tqdm
wandb
# API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
pandas
peft==0.15.*
@ -25,7 +24,6 @@ tqdm
wandb
# API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,18 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,18 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"

View file

@ -0,0 +1,20 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,15 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

View file

@ -1,11 +1,8 @@
import os
import warnings
from modules import shared
import accelerate # This early import makes Intel GPUs happy
import modules.one_click_installer_check
from modules import shared
from modules.block_requests import OpenMonkeyPatch, RequestBlocker
from modules.logging_colors import logger
@ -38,7 +35,6 @@ import yaml
import modules.extensions as extensions_module
from modules import (
chat,
training,
ui,
ui_chat,
@ -89,7 +85,7 @@ def create_interface():
# Force some events to be triggered on page load
shared.persistent_interface_state.update({
'loader': shared.args.loader or 'Transformers',
'loader': shared.args.loader or 'llama.cpp',
'mode': shared.settings['mode'] if shared.settings['mode'] == 'instruct' else gr.update(),
'character_menu': shared.args.character or shared.settings['character'],
'instruction_template_str': shared.settings['instruction_template_str'],
@ -218,10 +214,28 @@ if __name__ == "__main__":
if extension not in shared.args.extensions:
shared.args.extensions.append(extension)
available_models = utils.get_available_models()
# Model defined through --model
if shared.args.model is not None:
shared.model_name = shared.args.model
# Select the model from a command-line menu
elif shared.args.model_menu:
if len(available_models) == 0:
logger.error('No models are available! Please download at least one.')
sys.exit(0)
else:
print('The following models are available:\n')
for i, model in enumerate(available_models):
print(f'{i+1}. {model}')
print(f'\nWhich one do you want to load? 1-{len(available_models)}\n')
i = int(input()) - 1
print()
shared.model_name = available_models[i]
# If any model has been selected, load it
if shared.model_name != 'None':
p = Path(shared.model_name)

View file

@ -2,6 +2,12 @@
cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch "$@"
exit $?
fi
if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
# deactivate existing conda envs as needed to avoid conflicts

View file

@ -2,6 +2,12 @@
cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@"
exit $?
fi
if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
# deactivate existing conda envs as needed to avoid conflicts

View file

@ -3,6 +3,12 @@ setlocal enabledelayedexpansion
cd /D "%~dp0"
@rem Portable install case
if exist "portable_env" (
.\portable_env\python.exe server.py --api --auto-launch %*
exit /b %errorlevel%
)
set PATH=%PATH%;%SystemRoot%\system32
echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end