Mirror of https://github.com/oobabooga/text-generation-webui.git, synced 2025-06-07 14:17:09 -04:00

Commit a778270536: 48 changed files with 1292 additions and 705 deletions
.github/dependabot.yml (vendored): 10 lines changed

@@ -5,8 +5,14 @@
 version: 2
 updates:
-  - package-ecosystem: "pip" # See documentation for possible values
-    directory: "/" # Location of package manifests
+  - package-ecosystem: "pip"
+    directory: "/requirements/full/"
+    target-branch: "dev"
+    schedule:
+      interval: "weekly"
+
+  - package-ecosystem: "pip"
+    directory: "/requirements/portable/"
     target-branch: "dev"
     schedule:
       interval: "weekly"
.github/workflows/build-everything-tgw.yml (vendored, new file): 49 lines

name: Build Everything TGW

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string

permissions:
  contents: write

jobs:
  build_release_cuda_windows:
    name: CUDA Windows
    uses: ./.github/workflows/build-portable-release-cuda.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:windows-2019'

  build_release_cuda_linux:
    name: CUDA Linux
    uses: ./.github/workflows/build-portable-release-cuda.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:ubuntu-22.04'

  build_release_cpu_windows:
    name: CPU Windows
    uses: ./.github/workflows/build-portable-release.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:windows-2019'

  build_release_cpu_linux:
    name: CPU Linux
    uses: ./.github/workflows/build-portable-release.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:ubuntu-22.04'

  build_release_macos:
    name: macOS
    uses: ./.github/workflows/build-portable-release.yml
    with:
      version: ${{ inputs.version }}
      config: 'os:macos-13,macos-14'
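This aggregate workflow only runs on workflow_dispatch, so a maintainer has to trigger it by hand. As an illustration only (not part of the commit), a dispatch through GitHub's REST API could look like the following Python sketch; the token environment variable and the branch ref are assumptions.

# Hypothetical helper, not from the repository: triggers "Build Everything TGW"
# through GitHub's workflow_dispatch REST endpoint.
import os

import requests

def dispatch_build(version: str = "v3.0", ref: str = "main") -> None:
    url = ("https://api.github.com/repos/oobabooga/text-generation-webui"
           "/actions/workflows/build-everything-tgw.yml/dispatches")
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",  # assumed token variable
    }
    payload = {"ref": ref, "inputs": {"version": version}}
    response = requests.post(url, headers=headers, json=payload, timeout=30)
    response.raise_for_status()  # GitHub answers 204 No Content on success

if __name__ == "__main__":
    dispatch_build("v3.0")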
.github/workflows/build-portable-release-cuda.yml (vendored, new file): 183 lines

name: Build CUDA

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string
  workflow_call:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string

permissions:
  contents: write

jobs:
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh
    env:
      CONFIGIN: ${{ inputs.config }}
      EXCLUDEIN: ${{ inputs.exclude }}

    steps:
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('ubuntu-22.04', 'windows-2019')
              'pyver' = @("3.11")
              'avx' = @("AVX2")
              'cuda' = @("11.7", "12.4")
          }

          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}

          if ($env:EXCLUDEIN -ne 'None') {
              $exclusions = @()
              $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
              $matrix['exclude'] = $exclusions
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }} CUDA ${{ matrix.cuda }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    env:
      AVXVER: ${{ matrix.avx }}
      PCKGVER: ${{ inputs.version }}

    steps:
      - uses: actions/checkout@v4
        with:
          repository: 'oobabooga/text-generation-webui'
          ref: ${{ inputs.version }}
          submodules: 'recursive'

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}

      - name: Build Package
        shell: bash
        run: |
          rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker

          # Define common variables
          CUDA_VERSION="${{ matrix.cuda }}"
          AVX_SUPPORT="${{ matrix.avx }}"
          VERSION="${{ inputs.version }}"

          # 1. Set platform-specific variables
          if [[ "$RUNNER_OS" == "Windows" ]]; then
            PLATFORM="windows"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-pc-windows-msvc-install_only.tar.gz"
            PIP_PATH="portable_env/python.exe -m pip"
            PACKAGES_PATH="portable_env/Lib/site-packages"
            ZIP_CMD="powershell -Command \"Compress-Archive -Path text-generation-webui -DestinationPath"
            rm start_linux.sh start_macos.sh
          else
            PLATFORM="linux"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
            PIP_PATH="portable_env/bin/python -m pip"
            PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
            ZIP_CMD="zip -r"
            rm start_macos.sh start_windows.bat
          fi

          # 2. Download and extract Python
          cd ..
          echo "Downloading Python for $PLATFORM..."
          curl -L -o python-build.tar.gz "$PYTHON_URL"
          tar -xzf python-build.tar.gz
          mv python text-generation-webui/portable_env

          # 3. Prepare requirements file based on AVX and CUDA
          if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
            BASE_REQ_FILE="requirements/portable/requirements.txt"
          else
            BASE_REQ_FILE="requirements/portable/requirements_noavx2.txt"
          fi

          # Create CUDA-specific requirements file if needed
          cd text-generation-webui
          if [[ "$CUDA_VERSION" == "11.7" ]]; then
            echo "Creating CUDA 11.7 specific requirements file"
            sed 's/cu124/cu117/g' "$BASE_REQ_FILE" > requirements_cuda_temp.txt
            REQ_FILE="requirements_cuda_temp.txt"
          else
            REQ_FILE="$BASE_REQ_FILE"
          fi

          # 4. Install packages
          echo "Installing Python packages from $REQ_FILE..."
          $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"

          # 5. Clean up
          if [[ "$CUDA_VERSION" == "11.7" ]]; then
            rm requirements_cuda_temp.txt
          fi

          # 6. Create ZIP file
          cd ..
          VERSION_CLEAN="${VERSION#v}"
          ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
          echo "Creating archive: $ZIP_NAME"

          if [[ "$RUNNER_OS" == "Windows" ]]; then
            powershell -Command "Compress-Archive -Path text-generation-webui -DestinationPath $ZIP_NAME"
          else
            zip -r "$ZIP_NAME" text-generation-webui
          fi

      - name: Upload files to a GitHub release
        id: upload-release
        uses: svenstaro/upload-release-action@2.7.0
        continue-on-error: true
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: ../textgen-portable-${{ inputs.version }}*.zip
          tag: ${{ inputs.version }}
          file_glob: true
          make_latest: false
          overwrite: true
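For readers who do not parse PowerShell every day: the define_matrix step above turns the config override string ('key1:item1-1,item1-2;key2:item2-1,item2-2') into wholesale replacements of matrix keys, and the exclude string into a list of key/value combinations to drop. The following Python sketch is an illustration of that parsing only, not code from the commit.

# Illustrative re-implementation of the define_matrix parsing (not part of the commit).
import json

def build_matrix(config: str = "Default", exclude: str = "None") -> str:
    matrix = {
        "os": ["ubuntu-22.04", "windows-2019"],
        "pyver": ["3.11"],
        "avx": ["AVX2"],
        "cuda": ["11.7", "12.4"],
    }

    # 'os:windows-2019;cuda:12.4' replaces whole keys of the default matrix.
    if config != "Default":
        for part in config.split(";"):
            key, values = part.split(":", 1)
            matrix[key] = values.split(",")

    # 'os:windows-2019,cuda:11.7;...' lists key:value pairs excluded as a combination.
    if exclude != "None":
        exclusions = []
        for combo in exclude.split(";"):
            exclusions.append(dict(pair.split(":", 1) for pair in combo.split(",")))
        matrix["exclude"] = exclusions

    return json.dumps(matrix, separators=(",", ":"))  # compact, like ConvertTo-Json -Compress

print(build_matrix(config="os:windows-2019"))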
.github/workflows/build-portable-release.yml (vendored, new file): 193 lines

name: Build CPU and macOS

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string
  workflow_call:
    inputs:
      version:
        description: 'Version tag of text-generation-webui to build: v3.0'
        default: 'v3.0'
        required: true
        type: string
      config:
        description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
        default: 'Default'
        required: false
        type: string
      exclude:
        description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
        default: 'None'
        required: false
        type: string

permissions:
  contents: write

jobs:
  define_matrix:
    name: Define Build Matrix
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    defaults:
      run:
        shell: pwsh
    env:
      CONFIGIN: ${{ inputs.config }}
      EXCLUDEIN: ${{ inputs.exclude }}

    steps:
      - name: Define Job Output
        id: set-matrix
        run: |
          $matrix = @{
              'os' = @('ubuntu-22.04', 'windows-2019', 'macos-13', 'macos-14')
              'pyver' = @("3.11")
              'avx' = @("AVX2")
          }

          if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}

          if ($env:EXCLUDEIN -ne 'None') {
              $exclusions = @()
              $exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
              $matrix['exclude'] = $exclusions
          }

          $matrixOut = ConvertTo-Json $matrix -Compress
          Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT

  build_wheels:
    name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }}
    needs: define_matrix
    runs-on: ${{ matrix.os }}
    strategy:
      matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
    defaults:
      run:
        shell: pwsh
    env:
      AVXVER: ${{ matrix.avx }}
      PCKGVER: ${{ inputs.version }}

    steps:
      - uses: actions/checkout@v4
        with:
          repository: 'oobabooga/text-generation-webui'
          ref: ${{ inputs.version }}
          submodules: 'recursive'

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.pyver }}

      - name: Build Package
        shell: bash
        run: |
          rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker

          # Define common variables
          AVX_SUPPORT="${{ matrix.avx }}"
          VERSION="${{ inputs.version }}"
          OS_TYPE="${{ matrix.os }}"

          # 1. Set platform-specific variables
          if [[ "$RUNNER_OS" == "Windows" ]]; then
            PLATFORM="windows-cpu"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-pc-windows-msvc-install_only.tar.gz"
            PIP_PATH="portable_env/python.exe -m pip"
            PACKAGES_PATH="portable_env/Lib/site-packages"
            rm start_linux.sh start_macos.sh
          elif [[ "$RUNNER_OS" == "macOS" ]]; then
            if [[ "$OS_TYPE" == "macos-13" ]]; then
              PLATFORM="macos-x86_64"
              PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-apple-darwin-install_only.tar.gz"
              REQ_TYPE="apple_intel"
            else
              PLATFORM="macos-arm64"
              PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-aarch64-apple-darwin-install_only.tar.gz"
              REQ_TYPE="apple_silicon"
            fi
            PIP_PATH="portable_env/bin/python -m pip"
            PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
            rm start_linux.sh start_windows.bat
          else
            # Linux case
            PLATFORM="linux-cpu"
            PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
            PIP_PATH="portable_env/bin/python -m pip"
            PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
            rm start_macos.sh start_windows.bat
          fi

          # 2. Download and extract Python
          echo "Downloading Python for $PLATFORM..."
          cd ..
          curl -L -o python-build.tar.gz "$PYTHON_URL"
          tar -xzf python-build.tar.gz
          mv python text-generation-webui/portable_env

          # 3. Prepare requirements file based on platform and AVX
          cd text-generation-webui

          # Select requirements file based on platform
          if [[ "$RUNNER_OS" == "macOS" ]]; then
            if [[ "$OS_TYPE" == "macos-13" ]]; then
              REQ_FILE="requirements/portable/requirements_apple_intel.txt"
            else
              REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
            fi
          else
            # For Windows and Linux, check AVX support
            if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
              REQ_FILE="requirements/portable/requirements_cpu_only.txt"
            else
              REQ_FILE="requirements/portable/requirements_cpu_only_noavx2.txt"
            fi
          fi

          echo "Using requirements file: $REQ_FILE"

          # 4. Install packages
          echo "Installing Python packages from $REQ_FILE..."
          $PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"

          # 5. Create ZIP file
          cd ..
          VERSION_CLEAN="${VERSION#v}"
          ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip"
          echo "Creating archive: $ZIP_NAME"

          if [[ "$RUNNER_OS" == "Windows" ]]; then
            powershell -Command "Compress-Archive -Path text-generation-webui -DestinationPath $ZIP_NAME"
          else
            zip -r "$ZIP_NAME" text-generation-webui
          fi

      - name: Upload files to a GitHub release
        id: upload-release
        uses: svenstaro/upload-release-action@2.7.0
        continue-on-error: true
        with:
          repo_token: ${{ secrets.GITHUB_TOKEN }}
          file: ../textgen-portable-${{ inputs.version }}*.zip
          tag: ${{ inputs.version }}
          file_glob: true
          make_latest: false
          overwrite: true
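Both workflows name their release assets the same way: textgen-portable-<version without the leading v>-<platform>.zip, with -cuda<version> appended by the CUDA workflow. A small helper that reproduces the naming, for illustration only (not part of the commit):

# Illustrative only: mirrors the ZIP naming used by the two build workflows above.
from typing import Optional

def portable_zip_name(version: str, platform: str, cuda: Optional[str] = None) -> str:
    version_clean = version[1:] if version.startswith("v") else version  # like "${VERSION#v}"
    suffix = f"-cuda{cuda}" if cuda else ""  # only the CUDA workflow adds this
    return f"textgen-portable-{version_clean}-{platform}{suffix}.zip"

assert portable_zip_name("v3.0", "linux", "12.4") == "textgen-portable-3.0-linux-cuda12.4.zip"
assert portable_zip_name("v3.0", "macos-arm64") == "textgen-portable-3.0-macos-arm64.zip"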
README.md: 12 lines changed

@@ -27,6 +27,14 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 
 ## How to install
 
+#### Option 1: Portable builds
+
+Compatible with GGUF (llama.cpp) models, just unzip and run, no installation. Available for Windows, Linux, and macOS.
+
+Download from: https://github.com/oobabooga/text-generation-webui/releases
+
+#### Option 2: One-click installer
+
 1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip).
 2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`.
 3) Select your GPU vendor when asked.

@@ -352,6 +360,10 @@ Run `python download-model.py --help` to see all the options.
 
 https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/main/Colab-TextGen-GPU.ipynb
 
+## Community
+
+https://www.reddit.com/r/Oobabooga/
+
 ## Acknowledgment
 
 In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.

---

@@ -7,10 +7,7 @@ from io import BytesIO
 
 import requests
 import tiktoken
-import torch
-import torch.nn.functional as F
 from PIL import Image
-from transformers import LogitsProcessor, LogitsProcessorList
 
 from extensions.openai.errors import InvalidRequestError
 from extensions.openai.utils import debug_msg

@@ -22,54 +19,7 @@ from modules.chat import (
     load_instruction_template_memoized
 )
 from modules.presets import load_preset_memoized
-from modules.text_generation import (
-    decode,
-    encode,
-    generate_reply,
-    get_reply_from_output_ids
-)
-
-
-class LogitsBiasProcessor(LogitsProcessor):
-    def __init__(self, logit_bias={}):
-        self.logit_bias = logit_bias
-        if self.logit_bias:
-            self.keys = list([int(key) for key in self.logit_bias.keys()])
-            values = [self.logit_bias[str(key)] for key in self.keys]
-            self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device)
-            debug_msg(f"{self})")
-
-    def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
-        if self.logit_bias:
-            debug_msg(logits[0, self.keys], " + ", self.values)
-            logits[0, self.keys] += self.values
-            debug_msg(" --> ", logits[0, self.keys])
-            debug_msg(" max/min ", float(torch.max(logits[0])), float(torch.min(logits[0])))
-
-        return logits
-
-    def __repr__(self):
-        return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>"
-
-
-class LogprobProcessor(LogitsProcessor):
-    def __init__(self, logprobs=None):
-        self.logprobs = logprobs
-        self.token_alternatives = {}
-
-    def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
-        if self.logprobs is not None:  # 0-5
-            log_e_probabilities = F.log_softmax(logits, dim=1)
-            top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1)
-            top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]]
-            top_probs = [float(x) for x in top_values[0]]
-            self.token_alternatives = dict(zip(top_tokens, top_probs))
-            debug_msg(repr(self))
-
-        return logits
-
-    def __repr__(self):
-        return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>"
+from modules.text_generation import decode, encode, generate_reply
 
 
 def convert_logprobs_to_tiktoken(model, logprobs):

@@ -107,21 +57,29 @@ def process_parameters(body, is_legacy=False):
     elif isinstance(body['stop'], list):
         generate_params['custom_stopping_strings'] = body['stop']
 
-    logits_processor = []
-    logit_bias = body.get('logit_bias', None)
-    if logit_bias:  # {str: float, ...}
-        logits_processor = [LogitsBiasProcessor(logit_bias)]
-
-    logprobs = None  # coming to chat eventually
-    if 'logprobs' in body:
-        logprobs = body.get('logprobs', 0)  # maybe cap at topk? don't clamp 0-5.
-        generate_params['logprob_proc'] = LogprobProcessor(logprobs)
-        logits_processor.extend([generate_params['logprob_proc']])
-    else:
-        logprobs = None
-
-    if logits_processor:  # requires logits_processor support
-        generate_params['logits_processor'] = LogitsProcessorList(logits_processor)
+    if shared.args.loader != 'llama.cpp':
+        from transformers import LogitsProcessorList
+
+        from modules.transformers_loader import (
+            LogitsBiasProcessor,
+            LogprobProcessor
+        )
+
+        logits_processor = []
+        logit_bias = body.get('logit_bias', None)
+        if logit_bias:  # {str: float, ...}
+            logits_processor = [LogitsBiasProcessor(logit_bias)]
+
+        logprobs = None  # coming to chat eventually
+        if 'logprobs' in body:
+            logprobs = body.get('logprobs', 0)  # maybe cap at topk? don't clamp 0-5.
+            generate_params['logprob_proc'] = LogprobProcessor(logprobs)
+            logits_processor.extend([generate_params['logprob_proc']])
+        else:
+            logprobs = None
+
+        if logits_processor:  # requires logits_processor support
+            generate_params['logits_processor'] = LogitsProcessorList(logits_processor)
 
     return generate_params
 
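For context on the parameters handled above: logit_bias maps token-id strings to additive float biases, and logprobs requests the top alternatives for each generated token. A hedged example of an OpenAI-compatible request exercising both follows; the host, port, and token ids are placeholders rather than values taken from the commit.

# Illustrative request against the extension's OpenAI-compatible completions route.
# The URL and the token ids in logit_bias are placeholders.
import requests

body = {
    "prompt": "Once upon a time",
    "max_tokens": 32,
    "logit_bias": {"15043": 5.0, "50256": -100.0},  # {token id as str: float bias}
    "logprobs": 3,                                   # top alternatives per token
}

response = requests.post("http://127.0.0.1:5000/v1/completions", json=body, timeout=60)
print(response.json()["choices"][0]["text"])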
---

@@ -6,7 +6,6 @@
 from collections import deque
 from threading import Thread
 
-import speech_recognition as sr
 import uvicorn
 from fastapi import Depends, FastAPI, Header, HTTPException
 from fastapi.middleware.cors import CORSMiddleware

@@ -16,11 +15,9 @@ from pydub import AudioSegment
 from sse_starlette import EventSourceResponse
 
 import extensions.openai.completions as OAIcompletions
-import extensions.openai.embeddings as OAIembeddings
 import extensions.openai.images as OAIimages
 import extensions.openai.logits as OAIlogits
 import extensions.openai.models as OAImodels
-import extensions.openai.moderations as OAImoderations
 from extensions.openai.errors import ServiceUnavailableError
 from extensions.openai.tokens import token_count, token_decode, token_encode
 from extensions.openai.utils import _start_cloudflared

@@ -165,6 +162,8 @@ def handle_billing_usage():
 
 @app.post('/v1/audio/transcriptions', dependencies=check_key)
 async def handle_audio_transcription(request: Request):
+    import speech_recognition as sr
+
     r = sr.Recognizer()
 
     form = await request.form()

@@ -211,6 +210,8 @@ async def handle_image_generation(request: Request):
 
 @app.post("/v1/embeddings", response_model=EmbeddingsResponse, dependencies=check_key)
 async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
+    import extensions.openai.embeddings as OAIembeddings
+
     input = request_data.input
     if not input:
         raise HTTPException(status_code=400, detail="Missing required argument input")

@@ -224,6 +225,8 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
 
 @app.post("/v1/moderations", dependencies=check_key)
 async def handle_moderations(request: Request):
+    import extensions.openai.moderations as OAImoderations
+
     body = await request.json()
     input = body["input"]
     if not input:

---

@@ -2,7 +2,6 @@ from pathlib import Path
 
 import modules.shared as shared
 from modules.logging_colors import logger
-from modules.models import get_device
 
 
 def add_lora_to_model(lora_names):

@@ -47,9 +46,10 @@ def add_lora_exllamav2(lora_names):
 
 
 def add_lora_transformers(lora_names):
     from peft import PeftModel
 
+    from modules.torch_utils import get_device
+
     prior_set = set(shared.lora_names)
     added_set = set(lora_names) - prior_set
     removed_set = prior_set - set(lora_names)

---

@@ -2,9 +2,6 @@ import traceback
 from queue import Queue
 from threading import Thread
 
-import torch
-import transformers
-
 import modules.shared as shared
 
 

@@ -12,25 +9,6 @@ class StopNowException(Exception):
     pass
 
 
-class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
-    def __init__(self):
-        transformers.StoppingCriteria.__init__(self)
-
-    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
-        return shared.stop_everything
-
-
-class Stream(transformers.StoppingCriteria):
-    def __init__(self, callback_func=None):
-        self.callback_func = callback_func
-
-    def __call__(self, input_ids, scores) -> bool:
-        if self.callback_func is not None:
-            self.callback_func(input_ids[0])
-
-        return False
-
-
 class Iteratorize:
 
     """

---

@@ -2,13 +2,11 @@ import datetime
 from pathlib import Path
 
 import pandas as pd
-import torch
-from datasets import load_dataset
 from tqdm import tqdm
 
 from modules import shared
 from modules.logging_colors import logger
-from modules.models import clear_torch_cache, load_model, unload_model
+from modules.models import load_model, unload_model
 from modules.models_settings import get_model_metadata, update_model_parameters
 from modules.text_generation import encode

@@ -39,6 +37,11 @@ def calculate_perplexity(models, input_dataset, stride, _max_length):
     https://huggingface.co/docs/transformers/perplexity#calculating-ppl-with-fixedlength-models
     '''
 
+    import torch
+    from datasets import load_dataset
+
+    from modules.torch_utils import clear_torch_cache
+
     if shared.args.loader == "llama.cpp":
         logger.error("Perplexity evaluation is not implemented for the llama.cpp loader.")
         raise ValueError

---

@@ -4,10 +4,6 @@ from pathlib import Path
 from typing import Any, Dict, Optional, Union
 
 import torch
-from torch.nn import CrossEntropyLoss
-from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
-from transformers.modeling_outputs import CausalLMOutputWithPast
-
 from exllamav2 import (
     ExLlamaV2,
     ExLlamaV2Cache,

@@ -18,6 +14,15 @@ from exllamav2 import (
     ExLlamaV2Cache_TP,
     ExLlamaV2Config
 )
+from torch.nn import CrossEntropyLoss
+from transformers import (
+    GenerationConfig,
+    GenerationMixin,
+    PretrainedConfig,
+    PreTrainedModel
+)
+from transformers.modeling_outputs import CausalLMOutputWithPast
 
 from modules import shared
 from modules.logging_colors import logger

@@ -28,7 +33,7 @@ except Exception:
     traceback.print_exc()
 
 
-class Exllamav2HF(PreTrainedModel):
+class Exllamav2HF(PreTrainedModel, GenerationMixin):
     def __init__(self, config: ExLlamaV2Config):
         super().__init__(PretrainedConfig())
         self.ex_config = config

---

@@ -6,7 +6,12 @@ from typing import Any, Dict, Optional, Union
 import torch
 from exllamav3 import Cache, Config, Model
 from torch.nn import CrossEntropyLoss
-from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
+from transformers import (
+    GenerationConfig,
+    GenerationMixin,
+    PretrainedConfig,
+    PreTrainedModel
+)
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
 from modules import shared

@@ -19,7 +24,7 @@ except Exception:
     traceback.print_exc()
 
 
-class Exllamav3HF(PreTrainedModel):
+class Exllamav3HF(PreTrainedModel, GenerationMixin):
     def __init__(self, model_dir):
         super().__init__(PretrainedConfig())
         self.generation_config = GenerationConfig()

---

@@ -1,4 +1,5 @@
 import json
+import os
 import pprint
 import socket
 import subprocess

@@ -281,12 +282,21 @@ class LlamaServer:
         if shared.args.rope_freq_base > 0:
             cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)]
 
+        env = os.environ.copy()
+        if os.name == 'posix':
+            current_path = env.get('LD_LIBRARY_PATH', '')
+            if current_path:
+                env['LD_LIBRARY_PATH'] = f"{current_path}:{os.path.dirname(self.server_path)}"
+            else:
+                env['LD_LIBRARY_PATH'] = os.path.dirname(self.server_path)
+
         # Start the server with pipes for output
         self.process = subprocess.Popen(
             cmd,
             stderr=subprocess.PIPE,
             text=True,
-            bufsize=1
+            bufsize=1,
+            env=env
         )
 
 def filter_stderr(process_stderr):

---

@@ -3,29 +3,7 @@ from collections import OrderedDict
 
 import gradio as gr
 
-from modules import shared
-
 loaders_and_params = OrderedDict({
-    'Transformers': [
-        'gpu_memory',
-        'cpu_memory',
-        'alpha_value',
-        'compress_pos_emb',
-        'compute_dtype',
-        'quant_type',
-        'load_in_8bit',
-        'load_in_4bit',
-        'torch_compile',
-        'use_flash_attention_2',
-        'auto_devices',
-        'cpu',
-        'disk',
-        'use_double_quant',
-        'use_eager_attention',
-        'bf16',
-        'trust_remote_code',
-        'no_use_fast',
-    ],
     'llama.cpp': [
         'n_gpu_layers',
         'threads',

@@ -43,6 +21,25 @@ loaders_and_params = OrderedDict({
         'mlock',
         'numa',
     ],
+    'Transformers': [
+        'gpu_split',
+        'cpu_memory',
+        'alpha_value',
+        'compress_pos_emb',
+        'compute_dtype',
+        'quant_type',
+        'load_in_8bit',
+        'load_in_4bit',
+        'torch_compile',
+        'use_flash_attention_2',
+        'cpu',
+        'disk',
+        'use_double_quant',
+        'use_eager_attention',
+        'bf16',
+        'trust_remote_code',
+        'no_use_fast',
+    ],
     'ExLlamav3_HF': [
         'max_seq_len',
         'gpu_split',

@@ -346,10 +343,6 @@ def blacklist_samplers(loader, dynamic_temperature):
     return output
 
 
-def get_gpu_memory_keys():
-    return [k for k in shared.gradio if k.startswith('gpu_memory')]
-
-
 @functools.cache
 def get_all_params():
     all_params = set()

@@ -357,11 +350,6 @@ def get_all_params():
         for el in loaders_and_params[k]:
             all_params.add(el)
 
-    if 'gpu_memory' in all_params:
-        all_params.remove('gpu_memory')
-        for k in get_gpu_memory_keys():
-            all_params.add(k)
-
     return sorted(all_params)
 
 

@@ -371,8 +359,4 @@ def make_loader_params_visible(loader):
     if loader in loaders_and_params:
         params = loaders_and_params[loader]
 
-        if 'gpu_memory' in params:
-            params.remove('gpu_memory')
-            params += get_gpu_memory_keys()
-
     return [gr.update(visible=True) if k in params else gr.update(visible=False) for k in all_params]

---

@@ -2,11 +2,10 @@ import time
 import traceback
 
 import numpy as np
-import torch
 
-from modules import models, sampler_hijack, shared
+from modules import models, shared
 from modules.logging_colors import logger
-from modules.models import get_device, load_model
+from modules.models import load_model
 from modules.text_generation import generate_reply
 
 global_scores = None

@@ -38,18 +37,16 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
         logger.error("No model is loaded! Select one in the Model tab.")
         return 'Error: No model is loaded1 Select one in the Model tab.', previous
 
-    is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
-    is_llamacpp = shared.model.__class__.__name__ == 'LlamaServer'
-
-    if is_llamacpp:
+    # llama.cpp case
+    if shared.model.__class__.__name__ == 'LlamaServer':
         logprobs = shared.model.get_logits(prompt, state, n_probs=top_logits, use_samplers=use_samplers)
 
         if return_dict:
             output = {}
             for entry in logprobs:
                 token = repr(entry['token'])
                 prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
                 output[token] = prob
 
             return output
         else:
             output = ''

@@ -57,9 +54,17 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
                 token = repr(entry['token'])
                 prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
                 output += f"{prob:.5f} - {token}\n"
 
             return output, previous
 
+    # All other model types
     else:
+        import torch
+
+        from modules import sampler_hijack
+        from modules.torch_utils import get_device
+
+        is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
+
         if not use_samplers:
             state = {'stream': True}
 

---

@@ -1,61 +1,11 @@
import gc
|
import sys
|
||||||
import os
|
|
||||||
import pprint
|
|
||||||
import re
|
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import torch
|
|
||||||
import transformers
|
|
||||||
from accelerate import infer_auto_device_map, init_empty_weights
|
|
||||||
from accelerate.utils import (
|
|
||||||
is_ccl_available,
|
|
||||||
is_npu_available,
|
|
||||||
is_xpu_available
|
|
||||||
)
|
|
||||||
from transformers import (
|
|
||||||
AutoConfig,
|
|
||||||
AutoModel,
|
|
||||||
AutoModelForCausalLM,
|
|
||||||
AutoModelForSeq2SeqLM,
|
|
||||||
AutoTokenizer,
|
|
||||||
BitsAndBytesConfig,
|
|
||||||
is_torch_npu_available,
|
|
||||||
is_torch_xpu_available
|
|
||||||
)
|
|
||||||
|
|
||||||
import modules.shared as shared
|
import modules.shared as shared
|
||||||
from modules.logging_colors import logger
|
from modules.logging_colors import logger
|
||||||
from modules.models_settings import get_model_metadata
|
from modules.models_settings import get_model_metadata
|
||||||
|
|
||||||
transformers.logging.set_verbosity_error()
|
|
||||||
|
|
||||||
local_rank = None
|
|
||||||
if shared.args.deepspeed:
|
|
||||||
import deepspeed
|
|
||||||
from transformers.integrations.deepspeed import (
|
|
||||||
HfDeepSpeedConfig,
|
|
||||||
is_deepspeed_zero3_enabled
|
|
||||||
)
|
|
||||||
|
|
||||||
from modules.deepspeed_parameters import generate_ds_config
|
|
||||||
|
|
||||||
# Distributed setup
|
|
||||||
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
|
|
||||||
world_size = int(os.getenv("WORLD_SIZE", "1"))
|
|
||||||
if is_xpu_available() and is_ccl_available():
|
|
||||||
torch.xpu.set_device(local_rank)
|
|
||||||
deepspeed.init_distributed(backend="ccl")
|
|
||||||
elif is_npu_available():
|
|
||||||
torch.npu.set_device(local_rank)
|
|
||||||
deepspeed.init_distributed(dist_backend="hccl")
|
|
||||||
else:
|
|
||||||
torch.cuda.set_device(local_rank)
|
|
||||||
deepspeed.init_distributed()
|
|
||||||
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
|
|
||||||
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
|
|
||||||
|
|
||||||
|
|
||||||
last_generation_time = time.time()
|
last_generation_time = time.time()
|
||||||
|
|
||||||
|
|
||||||
|
@ -66,8 +16,8 @@ def load_model(model_name, loader=None):
|
||||||
shared.is_seq2seq = False
|
shared.is_seq2seq = False
|
||||||
shared.model_name = model_name
|
shared.model_name = model_name
|
||||||
load_func_map = {
|
load_func_map = {
|
||||||
'Transformers': huggingface_loader,
|
|
||||||
'llama.cpp': llama_cpp_server_loader,
|
'llama.cpp': llama_cpp_server_loader,
|
||||||
|
'Transformers': transformers_loader,
|
||||||
'ExLlamav3_HF': ExLlamav3_HF_loader,
|
'ExLlamav3_HF': ExLlamav3_HF_loader,
|
||||||
'ExLlamav2_HF': ExLlamav2_HF_loader,
|
'ExLlamav2_HF': ExLlamav2_HF_loader,
|
||||||
'ExLlamav2': ExLlamav2_loader,
|
'ExLlamav2': ExLlamav2_loader,
|
||||||
|
@ -85,8 +35,11 @@ def load_model(model_name, loader=None):
|
||||||
logger.error('The path to the model does not exist. Exiting.')
|
logger.error('The path to the model does not exist. Exiting.')
|
||||||
raise ValueError
|
raise ValueError
|
||||||
|
|
||||||
|
if loader != 'llama.cpp' and 'sampler_hijack' not in sys.modules:
|
||||||
|
from modules import sampler_hijack
|
||||||
|
sampler_hijack.hijack_samplers()
|
||||||
|
|
||||||
shared.args.loader = loader
|
shared.args.loader = loader
|
||||||
clear_torch_cache()
|
|
||||||
output = load_func_map[loader](model_name)
|
output = load_func_map[loader](model_name)
|
||||||
if type(output) is tuple:
|
if type(output) is tuple:
|
||||||
model, tokenizer = output
|
model, tokenizer = output
|
||||||
|
@ -95,6 +48,7 @@ def load_model(model_name, loader=None):
|
||||||
if model is None:
|
if model is None:
|
||||||
return None, None
|
return None, None
|
||||||
else:
|
else:
|
||||||
|
from modules.transformers_loader import load_tokenizer
|
||||||
tokenizer = load_tokenizer(model_name)
|
tokenizer = load_tokenizer(model_name)
|
||||||
|
|
||||||
shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
|
shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
|
||||||
|
@ -110,163 +64,6 @@ def load_model(model_name, loader=None):
|
||||||
return model, tokenizer
|
return model, tokenizer
|
||||||
|
|
||||||
|
|
||||||
def load_tokenizer(model_name, tokenizer_dir=None):
|
|
||||||
if tokenizer_dir:
|
|
||||||
path_to_model = Path(tokenizer_dir)
|
|
||||||
else:
|
|
||||||
path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
|
|
||||||
|
|
||||||
tokenizer = None
|
|
||||||
if path_to_model.exists():
|
|
||||||
if shared.args.no_use_fast:
|
|
||||||
logger.info('Loading the tokenizer with use_fast=False.')
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
|
||||||
path_to_model,
|
|
||||||
trust_remote_code=shared.args.trust_remote_code,
|
|
||||||
use_fast=not shared.args.no_use_fast
|
|
||||||
)
|
|
||||||
|
|
||||||
return tokenizer
|
|
||||||
|
|
||||||
|
|
||||||
def huggingface_loader(model_name):
|
|
||||||
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
|
|
||||||
params = {
|
|
||||||
'low_cpu_mem_usage': True,
|
|
||||||
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
|
|
||||||
}
|
|
||||||
|
|
||||||
if shared.args.trust_remote_code:
|
|
||||||
params['trust_remote_code'] = True
|
|
||||||
|
|
||||||
if shared.args.use_flash_attention_2:
|
|
||||||
params['use_flash_attention_2'] = True
|
|
||||||
|
|
||||||
if shared.args.force_safetensors:
|
|
||||||
params['force_safetensors'] = True
|
|
||||||
|
|
||||||
if shared.args.use_eager_attention:
|
|
||||||
params['attn_implementation'] = 'eager'
|
|
||||||
|
|
||||||
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
|
|
||||||
|
|
||||||
if 'chatglm' in model_name.lower():
|
|
||||||
LoaderClass = AutoModel
|
|
||||||
else:
|
|
||||||
if config.to_dict().get('is_encoder_decoder', False):
|
|
||||||
LoaderClass = AutoModelForSeq2SeqLM
|
|
||||||
shared.is_seq2seq = True
|
|
||||||
else:
|
|
||||||
LoaderClass = AutoModelForCausalLM
|
|
||||||
|
|
||||||
# Determine if we should use default loading
|
|
||||||
should_use_default_loading = not any([
|
|
||||||
shared.args.cpu,
|
|
||||||
shared.args.load_in_8bit,
|
|
||||||
shared.args.load_in_4bit,
|
|
||||||
shared.args.auto_devices,
|
|
||||||
shared.args.disk,
|
|
||||||
shared.args.deepspeed,
|
|
||||||
shared.args.gpu_memory is not None,
|
|
||||||
shared.args.cpu_memory is not None,
|
|
||||||
shared.args.compress_pos_emb > 1,
|
|
||||||
shared.args.alpha_value > 1,
|
|
||||||
])
|
|
||||||
|
|
||||||
# Load the model without any special settings
|
|
||||||
if should_use_default_loading:
|
|
||||||
logger.info("TRANSFORMERS_PARAMS=")
|
|
||||||
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
|
|
||||||
print()
|
|
||||||
|
|
||||||
model = LoaderClass.from_pretrained(path_to_model, **params)
|
|
||||||
if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit):
|
|
||||||
device = get_device()
|
|
||||||
if device:
|
|
||||||
model = model.to(device)
|
|
||||||
|
|
||||||
# DeepSpeed ZeRO-3
|
|
||||||
elif shared.args.deepspeed:
|
|
||||||
model = LoaderClass.from_pretrained(
|
|
||||||
path_to_model,
|
|
||||||
torch_dtype=params['torch_dtype'],
|
|
||||||
trust_remote_code=params.get('trust_remote_code')
|
|
||||||
)
|
|
||||||
|
|
||||||
model = deepspeed.initialize(
|
|
||||||
model=model,
|
|
||||||
config_params=ds_config,
|
|
||||||
model_parameters=None,
|
|
||||||
optimizer=None,
|
|
||||||
lr_scheduler=None
|
|
||||||
)[0]
|
|
||||||
|
|
||||||
model.module.eval() # Inference
|
|
||||||
logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
|
|
||||||
|
|
||||||
# Load with quantization and/or offloading
|
|
||||||
else:
|
|
||||||
if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
|
|
||||||
logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
|
|
||||||
shared.args.cpu = True
|
|
||||||
|
|
||||||
if shared.args.cpu:
|
|
||||||
params['torch_dtype'] = torch.float32
|
|
||||||
else:
|
|
||||||
params['device_map'] = 'auto'
|
|
||||||
if x := get_max_memory_dict():
|
|
||||||
params['max_memory'] = x
|
|
||||||
|
|
||||||
if shared.args.load_in_4bit:
|
|
||||||
# See https://github.com/huggingface/transformers/pull/23479/files
|
|
||||||
# and https://huggingface.co/blog/4bit-transformers-bitsandbytes
|
|
||||||
quantization_config_params = {
|
|
||||||
'load_in_4bit': True,
|
|
||||||
'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
|
|
||||||
'bnb_4bit_quant_type': shared.args.quant_type,
|
|
||||||
'bnb_4bit_use_double_quant': shared.args.use_double_quant,
|
|
||||||
'llm_int8_enable_fp32_cpu_offload': True
|
|
||||||
}
|
|
||||||
params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
|
|
||||||
|
|
||||||
elif shared.args.load_in_8bit:
|
|
||||||
if shared.args.auto_devices or shared.args.gpu_memory:
|
|
||||||
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
|
|
||||||
else:
|
|
||||||
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
|
|
||||||
|
|
||||||
if params.get('max_memory') is not None:
|
|
||||||
with init_empty_weights():
|
|
||||||
model = LoaderClass.from_config(config, trust_remote_code=params.get('trust_remote_code'))
|
|
||||||
|
|
||||||
model.tie_weights()
|
|
||||||
params['device_map'] = infer_auto_device_map(
|
|
||||||
model,
|
|
||||||
dtype=torch.int8,
|
|
||||||
max_memory=params.get('max_memory'),
|
|
||||||
no_split_module_classes=model._no_split_modules
|
|
||||||
)
|
|
||||||
|
|
||||||
if shared.args.disk:
|
|
||||||
params['offload_folder'] = shared.args.disk_cache_dir
|
|
||||||
|
|
||||||
if shared.args.compress_pos_emb > 1:
|
|
||||||
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
|
|
||||||
elif shared.args.alpha_value > 1:
|
|
||||||
params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
|
|
||||||
|
|
||||||
logger.info("TRANSFORMERS_PARAMS=")
|
|
||||||
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
|
|
||||||
print()
|
|
||||||
model = LoaderClass.from_pretrained(path_to_model, **params)
|
|
||||||
|
|
||||||
if shared.args.torch_compile:
|
|
||||||
model = torch.compile(model)
|
|
||||||
|
|
     return model


 def llama_cpp_server_loader(model_name):
     from modules.llama_cpp_server import LlamaServer

@@ -284,6 +81,11 @@ def llama_cpp_server_loader(model_name):
         logger.error(f"Error loading the model with llama.cpp: {str(e)}")


+def transformers_loader(model_name):
+    from modules.transformers_loader import load_model_HF
+    return load_model_HF(model_name)
+
+
 def ExLlamav3_HF_loader(model_name):
     from modules.exllamav3_hf import Exllamav3HF

@@ -328,71 +130,18 @@ def TensorRT_LLM_loader(model_name):
     return model


-def get_max_memory_dict():
-    max_memory = {}
-    max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
-    if shared.args.gpu_memory:
-        memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
-        for i in range(len(memory_map)):
-            max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
-
-        max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
-
-    # If --auto-devices is provided standalone, try to get a reasonable value
-    # for the maximum memory of device :0
-    elif shared.args.auto_devices:
-        if is_xpu_available():
-            total_mem = (torch.xpu.get_device_properties(0).total_memory / (1024 * 1024))
-        else:
-            total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
-
-        suggestion = round((total_mem - 1000) / 1000) * 1000
-        if total_mem - suggestion < 800:
-            suggestion -= 1000
-
-        suggestion = int(round(suggestion / 1000))
-        logger.warning(f"Auto-assiging --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors. You can manually set other values.")
-        max_memory[0] = f'{suggestion}GiB'
-        max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
-
-    return max_memory if len(max_memory) > 0 else None
-
-
-def get_device():
-    if torch.cuda.is_available():
-        return torch.device('cuda')
-    elif shared.args.deepspeed:
-        import deepspeed
-        return deepspeed.get_accelerator().current_device_name()
-    elif torch.backends.mps.is_available():
-        return torch.device('mps')
-    elif is_torch_xpu_available():
-        return torch.device('xpu:0')
-    elif is_torch_npu_available():
-        return torch.device('npu:0')
-    else:
-        return None
-
-
-def clear_torch_cache():
-    gc.collect()
-    if not shared.args.cpu:
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        elif is_xpu_available():
-            torch.xpu.empty_cache()
-        elif is_npu_available():
-            torch.npu.empty_cache()
-        elif torch.backends.mps.is_available():
-            if hasattr(torch.backends.mps, 'empty_cache'):
-                torch.backends.mps.empty_cache()
-
-
 def unload_model(keep_model_name=False):
+    if shared.model is None:
+        return
+
+    is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
     shared.model = shared.tokenizer = None
     shared.lora_names = []
     shared.model_dirty_from_training = False
-    clear_torch_cache()
+    if not is_llamacpp:
+        from modules.torch_utils import clear_torch_cache
+        clear_torch_cache()

     if not keep_model_name:
         shared.model_name = 'None'

@@ -188,41 +188,20 @@ def update_model_parameters(state, initial=False):
     UI: update the command-line arguments based on the interface values
     '''
     elements = ui.list_model_elements()  # the names of the parameters
-    gpu_memories = []

     for i, element in enumerate(elements):
         if element not in state:
             continue

         value = state[element]
-        if element.startswith('gpu_memory'):
-            gpu_memories.append(value)
-            continue
-
         if initial and element in shared.provided_arguments:
             continue

-        if element in ['cpu_memory'] and value == 0:
+        if element == 'cpu_memory' and value == 0:
             value = vars(shared.args_defaults)[element]

-        # Making some simple conversions
-        if element == 'cpu_memory' and value is not None:
-            value = f"{value}MiB"
-
         setattr(shared.args, element, value)

-    found_positive = False
-    for i in gpu_memories:
-        if i > 0:
-            found_positive = True
-            break
-
-    if not (initial and vars(shared.args)['gpu_memory'] != vars(shared.args_defaults)['gpu_memory']):
-        if found_positive:
-            shared.args.gpu_memory = [f"{i}MiB" for i in gpu_memories]
-        else:
-            shared.args.gpu_memory = None


 def apply_model_settings_to_state(model, state):
     '''
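The loaders above now defer their heavy imports until they are actually called. Below is a minimal, self-contained sketch of that dispatch-with-deferred-imports pattern; the function and dictionary names are illustrative, not the webui's real code.

def load_with_llama_cpp(model_name):
    # the real loader does: from modules.llama_cpp_server import LlamaServer
    return f"llama.cpp model: {model_name}"


def load_with_transformers(model_name):
    # the real loader does: from modules.transformers_loader import load_model_HF
    return f"transformers model: {model_name}"


LOADERS = {
    'llama.cpp': load_with_llama_cpp,
    'Transformers': load_with_transformers,
}


def load_model(model_name, loader='llama.cpp'):
    # Only the selected loader runs, so only its dependencies get imported.
    return LOADERS[loader](model_name)


print(load_model('my-model', loader='Transformers'))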
@@ -13,7 +13,10 @@ from transformers.generation.logits_process import (
 from modules import shared
 from modules.logging_colors import logger
-from modules.models import get_device
+from modules.torch_utils import get_device
+
+original_init = transformers.GenerationConfig.__init__
+original_get_logits_processor = transformers.GenerationMixin._get_logits_processor

 global_scores = None

@@ -484,7 +487,7 @@ def get_logits_processor_patch(self, **kwargs):
     generation_config.temperature = float(generation_config.temperature)  # Must be float

     # Get the original warpers
-    warpers = self._get_logits_processor_old(**kwargs)
+    warpers = original_get_logits_processor(self, **kwargs)

     for i in range(len(warpers) - 1, -1, -1):
         # Replace temperature with our modified class.

@@ -674,7 +677,7 @@ def get_logits_processor_patch(self, **kwargs):


 def generation_config_init_patch(self, **kwargs):
-    self.__init___old(**kwargs)
+    original_init(self, **kwargs)
     self.min_p = kwargs.pop("min_p", 0.0)
     self.dynamic_temperature = kwargs.pop("dynamic_temperature", False)
     self.dynatemp_low = kwargs.pop("dynatemp_low", 1)

@@ -702,8 +705,5 @@ def generation_config_init_patch(self, **kwargs):


 def hijack_samplers():
-    transformers.GenerationMixin._get_logits_processor_old = transformers.GenerationMixin._get_logits_processor
     transformers.GenerationMixin._get_logits_processor = get_logits_processor_patch
-
-    transformers.GenerationConfig.__init___old = transformers.GenerationConfig.__init__
     transformers.GenerationConfig.__init__ = generation_config_init_patch
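The hunks above switch the sampler patches from stashing *_old attributes on the transformers classes to holding module-level references to the originals. A small standalone sketch of that monkey-patching style, with made-up class and method names:

class Greeter:
    def greet(self, name):
        return f"hello {name}"


# Keep a module-level reference to the original method instead of writing
# an *_old attribute onto the patched class.
_original_greet = Greeter.greet


def patched_greet(self, name):
    # Call the saved original, then post-process its result.
    return _original_greet(self, name).upper()


Greeter.greet = patched_greet

print(Greeter().greet("world"))  # HELLO WORLD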
@@ -79,6 +79,7 @@ group.add_argument('--model', type=str, help='Name of the model to load by defau
 group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
 group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.')
 group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.')
+group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
 group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
 group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
 group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')

@@ -91,9 +92,7 @@ group.add_argument('--loader', type=str, help='Choose the model loader manually,
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
 group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
-group.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
-group.add_argument('--gpu-memory', type=str, nargs='+', help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.')
-group.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.')
+group.add_argument('--cpu-memory', type=float, default=0, help='Maximum CPU memory in GiB. Use this for CPU offloading.')
 group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
 group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".')
 group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')
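For reference, a standalone argparse sketch of the simplified flag (this parser is illustrative, not shared.py itself): --cpu-memory is now a single float in GiB with a default of 0, replacing --gpu-memory, --auto-devices, and the old string-typed --cpu-memory.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--cpu-memory', type=float, default=0,
                    help='Maximum CPU memory in GiB. Use this for CPU offloading.')

args = parser.parse_args(['--cpu-memory', '8'])
print(args.cpu_memory)  # 8.0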
@@ -7,33 +7,18 @@ import time
 import traceback

 import numpy as np
-import torch
-import transformers
-from transformers import (
-    LogitsProcessorList,
-    is_torch_npu_available,
-    is_torch_xpu_available
-)

 import modules.shared as shared
-from modules import models, sampler_hijack
+from modules import models
-from modules.callbacks import (
-    Iteratorize,
-    Stream,
-    _StopEverythingStoppingCriteria
-)
+from modules.callbacks import Iteratorize
 from modules.extensions import apply_extensions
-from modules.grammar.grammar_utils import initialize_grammar
-from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor
 from modules.html_generator import generate_basic_html
 from modules.logging_colors import logger
-from modules.models import clear_torch_cache, get_device, load_model

-sampler_hijack.hijack_samplers()


 def generate_reply(*args, **kwargs):
     if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
+        from modules.models import load_model
         shared.model, shared.tokenizer = load_model(shared.model_name)

     shared.generation_lock.acquire()

@@ -46,7 +31,6 @@ def generate_reply(*args, **kwargs):


 def _generate_reply(question, state, stopping_strings=None, is_chat=False, escape_html=False, for_ui=False):

     # Find the appropriate generation function
     generate_func = apply_extensions('custom_generate_reply')
     if generate_func is None:

@@ -80,7 +64,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
         all_stop_strings += st

     shared.stop_everything = False
-    seed = set_manual_seed(state['seed'])
     last_update = -1
     reply = ''
     is_stream = state['stream']

@@ -93,7 +76,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
         min_update_interval = 1 / state['max_updates_second']

     # Generate
-    for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
+    for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat):
         reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
         if escape_html:
             reply = html.escape(reply)

@@ -132,44 +115,55 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if shared.tokenizer is None:
         raise ValueError('No tokenizer is loaded')

-    if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel']:
-        if shared.model.__class__.__name__ == 'LlamaServer':
-            input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token)
-        else:
-            input_ids = shared.tokenizer.encode(str(prompt))
-
-        if shared.model.__class__.__name__ not in ['Exllamav2Model']:
-            input_ids = np.array(input_ids).reshape(1, len(input_ids))
-    else:
-        input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
-
-        if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None:
-            if add_bos_token:
-                if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0:
-                    # Add a missing bos token (it may not have been added due to faulty model metadata)
-                    bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]])
-                    input_ids = torch.cat((bos_tensor, input_ids), 1)
-
-                # Prevent double bos token due to jinja templates with <s> somewhere
-                while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id:
-                    input_ids = input_ids[:, 1:]
-            else:
-                # Remove any bos token that may have been added
-                while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
-                    input_ids = input_ids[:, 1:]
-
-    # Handling truncation
-    if truncation_length is not None:
-        input_ids = input_ids[:, -truncation_length:]
-
-    if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
-        return input_ids
-    else:
-        device = get_device()
-        if device:
-            return input_ids.to(device)
-
-        return input_ids
+    # llama.cpp case
+    if shared.model.__class__.__name__ == 'LlamaServer':
+        input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token)
+        input_ids = np.array(input_ids).reshape(1, len(input_ids))
+
+        if truncation_length is not None:
+            input_ids = input_ids[:, -truncation_length:]
+
+        return input_ids
+
+    # All other model types
+    else:
+        import torch
+
+        from modules.torch_utils import get_device
+
+        if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel']:
+            input_ids = shared.tokenizer.encode(str(prompt))
+            if shared.model.__class__.__name__ != 'Exllamav2Model':
+                input_ids = np.array(input_ids).reshape(1, len(input_ids))
+        else:
+            input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
+
+            if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None:
+                if add_bos_token:
+                    # Add BOS token if missing
+                    if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0:
+                        bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]])
+                        input_ids = torch.cat((bos_tensor, input_ids), 1)
+
+                    # Prevent double BOS tokens from jinja templates
+                    while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id:
+                        input_ids = input_ids[:, 1:]
+                else:
+                    # Remove BOS tokens when not wanted
+                    while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
+                        input_ids = input_ids[:, 1:]
+
+        if truncation_length is not None:
+            input_ids = input_ids[:, -truncation_length:]
+
+        if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
+            return input_ids
+        else:
+            device = get_device()
+            if device:
+                return input_ids.to(device)
+
+            return input_ids


 def decode(output_ids, skip_special_tokens=True):

@@ -225,13 +219,17 @@ def set_manual_seed(seed):
     if seed == -1:
         seed = random.randint(1, 2**31)

-    torch.manual_seed(seed)
-    if torch.cuda.is_available():
-        torch.cuda.manual_seed_all(seed)
-    elif is_torch_xpu_available():
-        torch.xpu.manual_seed_all(seed)
-    elif is_torch_npu_available():
-        torch.npu.manual_seed_all(seed)
+    if shared.args.loader != 'llama.cpp':
+        import torch
+        from transformers import is_torch_npu_available, is_torch_xpu_available
+
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+        elif is_torch_xpu_available():
+            torch.xpu.manual_seed_all(seed)
+        elif is_torch_npu_available():
+            torch.npu.manual_seed_all(seed)

     return seed

@@ -285,10 +283,26 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
     return reply


-def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False):
+def generate_reply_HF(question, original_question, state, stopping_strings=None, is_chat=False):
+    import torch
+    import transformers
+    from transformers import LogitsProcessorList
+
+    from modules.grammar.grammar_utils import initialize_grammar
+    from modules.grammar.logits_process import (
+        GrammarConstrainedLogitsProcessor
+    )
+    from modules.torch_utils import clear_torch_cache, get_device
+    from modules.transformers_loader import (
+        Stream,
+        _StopEverythingStoppingCriteria
+    )
+
     if shared.args.loader == 'Transformers':
         clear_torch_cache()

+    seed = set_manual_seed(state['seed'])
+
     generate_params = {}
     for k in [
         'temperature',

@@ -458,12 +472,12 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
             return


-def generate_reply_custom(question, original_question, seed, state, stopping_strings=None, is_chat=False):
+def generate_reply_custom(question, original_question, state, stopping_strings=None, is_chat=False):
     """
     For models that do not use the transformers library for sampling
     """
-    seed = set_manual_seed(state['seed'])

+    seed = set_manual_seed(state['seed'])
     t0 = time.time()
     reply = ''
     try:
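The BOS handling in the rewritten encode() is easiest to see on plain lists. A toy illustration follows (a simplification: the real code operates on a 1xN tensor and the tokenizer's bos_token_id):

BOS = 1


def normalize_bos(ids, add_bos_token=True):
    ids = list(ids)
    if add_bos_token:
        if not ids or ids[0] != BOS:
            ids = [BOS] + ids           # add a missing BOS token
        while len(ids) > 1 and ids[0] == BOS and ids[1] == BOS:
            ids = ids[1:]               # collapse doubled BOS from jinja templates
    else:
        while ids and ids[0] == BOS:
            ids = ids[1:]               # strip BOS when it is not wanted
    return ids


print(normalize_bos([1, 1, 5, 7]))      # [1, 5, 7]
print(normalize_bos([5, 7]))            # [1, 5, 7]
print(normalize_bos([1, 5, 7], False))  # [5, 7]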
37  modules/torch_utils.py  Normal file
@@ -0,0 +1,37 @@
+import gc
+
+import torch
+from accelerate.utils import is_npu_available, is_xpu_available
+from transformers import is_torch_npu_available, is_torch_xpu_available
+
+from modules import shared
+
+
+def get_device():
+    if torch.cuda.is_available():
+        return torch.device('cuda')
+    elif shared.args.deepspeed:
+        import deepspeed
+        return deepspeed.get_accelerator().current_device_name()
+    elif torch.backends.mps.is_available():
+        return torch.device('mps')
+    elif is_torch_xpu_available():
+        return torch.device('xpu:0')
+    elif is_torch_npu_available():
+        return torch.device('npu:0')
+    else:
+        return None
+
+
+def clear_torch_cache():
+    gc.collect()
+    if not shared.args.cpu:
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        elif is_xpu_available():
+            torch.xpu.empty_cache()
+        elif is_npu_available():
+            torch.npu.empty_cache()
+        elif torch.backends.mps.is_available():
+            if hasattr(torch.backends.mps, 'empty_cache'):
+                torch.backends.mps.empty_cache()
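A standalone sketch that mirrors the fallback order of the new module: prefer CUDA, then Apple MPS, then XPU when the build exposes it. This is a simplified stand-in, not the module itself, which also checks DeepSpeed, NPU, and the --cpu flag:

import gc

import torch


def pick_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    if torch.backends.mps.is_available():
        return torch.device('mps')
    if hasattr(torch, 'xpu') and torch.xpu.is_available():
        return torch.device('xpu:0')
    return None


def free_accelerator_cache():
    # Run the garbage collector first so released tensors can actually be freed.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available() and hasattr(torch.backends.mps, 'empty_cache'):
        torch.backends.mps.empty_cache()


print(pick_device())
free_accelerator_cache()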
@@ -15,13 +15,6 @@ from datetime import datetime
 from pathlib import Path

 import gradio as gr
-import torch
-import transformers
-from datasets import Dataset, load_dataset
-from transformers import is_torch_xpu_available
-from transformers.models.auto.modeling_auto import (
-    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
-)

 from modules import shared, ui, utils
 from modules.evaluate import (

@@ -33,7 +26,6 @@ from modules.logging_colors import logger
 from modules.models import reload_model
 from modules.utils import natural_keys

-MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}
 PARAMETERS = ["lora_name", "always_override", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"]
 WANT_INTERRUPT = False

@@ -284,6 +276,9 @@ def calc_trainable_parameters(model):


 def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str):
+    import torch
+    import transformers
+    from datasets import Dataset, load_dataset
     from peft import (
         LoraConfig,
         get_peft_model,

@@ -293,6 +288,12 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
     from peft.utils.other import \
         TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as \
         model_to_lora_modules
+    from transformers import is_torch_xpu_available
+    from transformers.models.auto.modeling_auto import (
+        MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+    )
+
+    MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}

     global WANT_INTERRUPT
     WANT_INTERRUPT = False
279  modules/transformers_loader.py  Normal file
@@ -0,0 +1,279 @@
+import os
+import pprint
+from pathlib import Path
+
+import torch
+import torch.nn.functional as F
+import transformers
+from accelerate import infer_auto_device_map, init_empty_weights
+from accelerate.utils import (
+    is_ccl_available,
+    is_npu_available,
+    is_xpu_available
+)
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    LogitsProcessor
+)
+
+import modules.shared as shared
+from modules.logging_colors import logger
+from modules.text_generation import get_reply_from_output_ids
+from modules.torch_utils import get_device
+
+transformers.logging.set_verbosity_error()
+
+local_rank = None
+if shared.args.deepspeed:
+    import deepspeed
+    from transformers.integrations.deepspeed import (
+        HfDeepSpeedConfig,
+        is_deepspeed_zero3_enabled
+    )
+
+    from modules.deepspeed_parameters import generate_ds_config
+
+    # Distributed setup
+    local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
+    world_size = int(os.getenv("WORLD_SIZE", "1"))
+    if is_xpu_available() and is_ccl_available():
+        torch.xpu.set_device(local_rank)
+        deepspeed.init_distributed(backend="ccl")
+    elif is_npu_available():
+        torch.npu.set_device(local_rank)
+        deepspeed.init_distributed(dist_backend="hccl")
+    else:
+        torch.cuda.set_device(local_rank)
+        deepspeed.init_distributed()
+
+    ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
+    dschf = HfDeepSpeedConfig(ds_config)  # Keep this object alive for the Transformers integration
+
+
+class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
+    def __init__(self):
+        transformers.StoppingCriteria.__init__(self)
+
+    def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
+        return shared.stop_everything
+
+
+class Stream(transformers.StoppingCriteria):
+    def __init__(self, callback_func=None):
+        self.callback_func = callback_func
+
+    def __call__(self, input_ids, scores) -> bool:
+        if self.callback_func is not None:
+            self.callback_func(input_ids[0])
+
+        return False
+
+
+class LogitsBiasProcessor(LogitsProcessor):
+    def __init__(self, logit_bias={}):
+        self.logit_bias = logit_bias
+        if self.logit_bias:
+            self.keys = list([int(key) for key in self.logit_bias.keys()])
+            values = [self.logit_bias[str(key)] for key in self.keys]
+            self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device)
+
+    def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
+        if self.logit_bias:
+            logits[0, self.keys] += self.values
+
+        return logits
+
+    def __repr__(self):
+        return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>"
+
+
+class LogprobProcessor(LogitsProcessor):
+    def __init__(self, logprobs=None):
+        self.logprobs = logprobs
+        self.token_alternatives = {}
+
+    def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
+        if self.logprobs is not None:  # 0-5
+            log_e_probabilities = F.log_softmax(logits, dim=1)
+            top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1)
+            top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]]
+            top_probs = [float(x) for x in top_values[0]]
+            self.token_alternatives = dict(zip(top_tokens, top_probs))
+
+        return logits
+
+    def __repr__(self):
+        return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>"
+
+
+def load_tokenizer(model_name, tokenizer_dir=None):
+    if tokenizer_dir:
+        path_to_model = Path(tokenizer_dir)
+    else:
+        path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
+
+    tokenizer = None
+    if path_to_model.exists():
+        if shared.args.no_use_fast:
+            logger.info('Loading the tokenizer with use_fast=False.')
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            path_to_model,
+            trust_remote_code=shared.args.trust_remote_code,
+            use_fast=not shared.args.no_use_fast
+        )
+
+    return tokenizer
+
+
+def load_model_HF(model_name):
+    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
+    params = {
+        'low_cpu_mem_usage': True,
+        'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
+    }
+
+    if shared.args.trust_remote_code:
+        params['trust_remote_code'] = True
+
+    if shared.args.use_flash_attention_2:
+        params['use_flash_attention_2'] = True
+
+    if shared.args.force_safetensors:
+        params['force_safetensors'] = True
+
+    if shared.args.use_eager_attention:
+        params['attn_implementation'] = 'eager'
+
+    config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
+
+    if 'chatglm' in model_name.lower():
+        LoaderClass = AutoModel
+    else:
+        if config.to_dict().get('is_encoder_decoder', False):
+            LoaderClass = AutoModelForSeq2SeqLM
+            shared.is_seq2seq = True
+        else:
+            LoaderClass = AutoModelForCausalLM
+
+    # Determine if we should use default loading
+    should_use_default_loading = not any([
+        shared.args.cpu,
+        shared.args.load_in_8bit,
+        shared.args.load_in_4bit,
+        shared.args.disk,
+        shared.args.deepspeed,
+        shared.args.cpu_memory is not None,
+        shared.args.compress_pos_emb > 1,
+        shared.args.alpha_value > 1,
+    ])
+
+    # Load the model without any special settings
+    if should_use_default_loading:
+        params['device_map'] = 'auto'
+
+        logger.info("TRANSFORMERS_PARAMS=")
+        pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
+        print()
+
+        model = LoaderClass.from_pretrained(path_to_model, **params)
+        if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit):
+            device = get_device()
+            if device:
+                model = model.to(device)
+
+    # DeepSpeed ZeRO-3
+    elif shared.args.deepspeed:
+        model = LoaderClass.from_pretrained(
+            path_to_model,
+            torch_dtype=params['torch_dtype'],
+            trust_remote_code=params.get('trust_remote_code')
+        )
+
+        model = deepspeed.initialize(
+            model=model,
+            config_params=ds_config,
+            model_parameters=None,
+            optimizer=None,
+            lr_scheduler=None
+        )[0]
+
+        model.module.eval()  # Inference
+        logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
+
+    # Load with quantization and/or offloading
+    else:
+        if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
+            logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
+            shared.args.cpu = True
+
+        if shared.args.cpu:
+            params['torch_dtype'] = torch.float32
+        else:
+            params['device_map'] = 'auto'
+            if x := get_max_memory_dict():
+                params['max_memory'] = x
+
+        if shared.args.load_in_4bit:
+            # See https://github.com/huggingface/transformers/pull/23479/files
+            # and https://huggingface.co/blog/4bit-transformers-bitsandbytes
+            quantization_config_params = {
+                'load_in_4bit': True,
+                'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
+                'bnb_4bit_quant_type': shared.args.quant_type,
+                'bnb_4bit_use_double_quant': shared.args.use_double_quant,
+                'llm_int8_enable_fp32_cpu_offload': True
+            }
+            params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
+
+        elif shared.args.load_in_8bit:
+            if shared.args.gpu_split:
+                params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
+            else:
+                params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
+
+            if params.get('max_memory') is not None:
+                with init_empty_weights():
+                    model = LoaderClass.from_config(config, trust_remote_code=params.get('trust_remote_code'))
+
+                model.tie_weights()
+                params['device_map'] = infer_auto_device_map(
+                    model,
+                    dtype=torch.int8,
+                    max_memory=params.get('max_memory'),
+                    no_split_module_classes=model._no_split_modules
+                )
+
+        if shared.args.disk:
+            params['offload_folder'] = shared.args.disk_cache_dir
+
+        if shared.args.compress_pos_emb > 1:
+            params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
+        elif shared.args.alpha_value > 1:
+            params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
+
+        logger.info("TRANSFORMERS_PARAMS=")
+        pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
+        print()
+        model = LoaderClass.from_pretrained(path_to_model, **params)
+
+    if shared.args.torch_compile:
+        model = torch.compile(model)
+
+    return model
+
+
+def get_max_memory_dict():
+    max_memory = {}
+    if shared.args.cpu_memory > 0:
+        max_memory['cpu'] = f'{shared.args.cpu_memory}GiB'
+
+    if shared.args.gpu_split:
+        for i, memory in enumerate(shared.args.gpu_split.split(',')):
+            max_memory[i] = f'{memory}GiB'
+
+    return max_memory if len(max_memory) > 0 else None
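The simplified get_max_memory_dict() above combines --cpu-memory (a float in GiB) with an optional comma-separated --gpu-split. A small standalone sketch of that logic, written as a free function for illustration:

def build_max_memory(cpu_memory=0.0, gpu_split=''):
    max_memory = {}
    if cpu_memory > 0:
        max_memory['cpu'] = f'{cpu_memory}GiB'

    if gpu_split:
        for i, amount in enumerate(gpu_split.split(',')):
            max_memory[i] = f'{amount.strip()}GiB'

    return max_memory if max_memory else None


print(build_max_memory(8, '20,7,7'))
# {'cpu': '8GiB', 0: '20GiB', 1: '7GiB', 2: '7GiB'}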
@@ -2,9 +2,7 @@ import copy
 from pathlib import Path

 import gradio as gr
-import torch
 import yaml
-from transformers import is_torch_xpu_available

 import extensions
 from modules import shared

@@ -128,7 +126,6 @@ def list_model_elements():
     'torch_compile',
     'flash_attn',
     'use_flash_attention_2',
-    'auto_devices',
     'cpu',
     'disk',
     'row_split',

@@ -150,13 +147,6 @@ def list_model_elements():
         'no_use_fast',
     ]

-    if is_torch_xpu_available():
-        for i in range(torch.xpu.device_count()):
-            elements.append(f'gpu_memory_{i}')
-    else:
-        for i in range(torch.cuda.device_count()):
-            elements.append(f'gpu_memory_{i}')
-
     return elements
@@ -1,14 +1,9 @@
 import importlib
-import math
-import re
 import traceback
 from functools import partial
 from pathlib import Path

 import gradio as gr
-import psutil
-import torch
-from transformers import is_torch_npu_available, is_torch_xpu_available

 from modules import loaders, shared, ui, utils
 from modules.logging_colors import logger

@@ -27,35 +22,6 @@ from modules.utils import gradio
 def create_ui():
     mu = shared.args.multi_user

-    # Finding the default values for the GPU and CPU memories
-    total_mem = []
-    if is_torch_xpu_available():
-        for i in range(torch.xpu.device_count()):
-            total_mem.append(math.floor(torch.xpu.get_device_properties(i).total_memory / (1024 * 1024)))
-    elif is_torch_npu_available():
-        for i in range(torch.npu.device_count()):
-            total_mem.append(math.floor(torch.npu.get_device_properties(i).total_memory / (1024 * 1024)))
-    else:
-        for i in range(torch.cuda.device_count()):
-            total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))
-
-    default_gpu_mem = []
-    if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
-        for i in shared.args.gpu_memory:
-            if 'mib' in i.lower():
-                default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
-            else:
-                default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)
-
-    while len(default_gpu_mem) < len(total_mem):
-        default_gpu_mem.append(0)
-
-    total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024))
-    if shared.args.cpu_memory is not None:
-        default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory)
-    else:
-        default_cpu_mem = 0
-
     with gr.Tab("Model", elem_id="model-tab"):
         with gr.Row():
             with gr.Column():

@@ -80,10 +46,6 @@ def create_ui():
                 with gr.Blocks():
                     with gr.Row():
                         with gr.Column():
-                            for i in range(len(total_mem)):
-                                shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i])
-
-                            shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem)
                             shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
                            shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
                            shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)

@@ -94,6 +56,7 @@ def create_ui():
                            shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
                            shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
                            shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
+                            shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
                            shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
                            shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
                            shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')

@@ -107,7 +70,6 @@ def create_ui():
                            shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
                            shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
                            shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
-                            shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
                            shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
                            shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
                            shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
195  one_click.py
@@ -15,7 +15,6 @@ import sys
 # os.environ["HSA_OVERRIDE_GFX_VERSION"] = '10.3.0'
 # os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030'

-
 # Define the required versions
 TORCH_VERSION = "2.6.0"
 TORCHVISION_VERSION = "0.21.0"

@@ -62,6 +61,19 @@ def is_x86_64():
     return platform.machine() == "x86_64"


+def is_installed():
+    site_packages_path = None
+    for sitedir in site.getsitepackages():
+        if "site-packages" in sitedir and conda_env_path in sitedir:
+            site_packages_path = sitedir
+            break
+
+    if site_packages_path:
+        return os.path.isfile(os.path.join(site_packages_path, 'torch', '__init__.py'))
+    else:
+        return os.path.isdir(conda_env_path)
+
+
 def cpu_has_avx2():
     try:
         import cpuinfo

@@ -104,44 +116,13 @@ def torch_version():
     return torver


-def update_pytorch_and_python():
-    print_big_message("Checking for PyTorch updates.")
-
-    # Update the Python version. Left here for future reference in case this becomes necessary.
-    # print_big_message("Checking for PyTorch and Python updates.")
-    # current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
-    # if current_python_version != PYTHON_VERSION:
-    #     run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True)
-
-    torver = torch_version()
-    base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}"
-
-    if "+cu" in torver:
-        install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
-    elif "+rocm" in torver:
-        install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
-    elif "+cpu" in torver:
-        install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
-    elif "+cxx11" in torver:
-        intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
-        install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
-    else:
-        install_cmd = base_cmd
-
-    run_cmd(install_cmd, assert_success=True, environment=True)
-
-
-def is_installed():
-    site_packages_path = None
-    for sitedir in site.getsitepackages():
-        if "site-packages" in sitedir and conda_env_path in sitedir:
-            site_packages_path = sitedir
-            break
-
-    if site_packages_path:
-        return os.path.isfile(os.path.join(site_packages_path, 'torch', '__init__.py'))
-    else:
-        return os.path.isdir(conda_env_path)
+def get_current_commit():
+    result = run_cmd("git rev-parse HEAD", capture_output=True, environment=True)
+    return result.stdout.decode('utf-8').strip()
+
+
+def get_extensions_names():
+    return [foldername for foldername in os.listdir('extensions') if os.path.isfile(os.path.join('extensions', foldername, 'requirements.txt'))]


 def check_env():

@@ -157,35 +138,11 @@ def check_env():
     sys.exit(1)


-def get_current_commit():
-    result = run_cmd("git rev-parse HEAD", capture_output=True, environment=True)
-    return result.stdout.decode('utf-8').strip()
-
-
 def clear_cache():
     run_cmd("conda clean -a -y", environment=True)
     run_cmd("python -m pip cache purge", environment=True)


-def print_big_message(message):
-    message = message.strip()
-    lines = message.split('\n')
-    print("\n\n*******************************************************************")
-    for line in lines:
-        print("*", line)
-
-    print("*******************************************************************\n\n")
-
-
-def calculate_file_hash(file_path):
-    p = os.path.join(script_dir, file_path)
-    if os.path.isfile(p):
-        with open(p, 'rb') as f:
-            return hashlib.sha256(f.read()).hexdigest()
-    else:
-        return ''
-
-
 def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, env=None):
     # Use the conda environment
     if environment:

@@ -210,6 +167,25 @@ def run_cmd(cmd, assert_success=False, environment=False, capture_output=False,
     return result


+def print_big_message(message):
+    message = message.strip()
+    lines = message.split('\n')
+    print("\n\n*******************************************************************")
+    for line in lines:
+        print("*", line)
+
+    print("*******************************************************************\n\n")
+
+
+def calculate_file_hash(file_path):
+    p = os.path.join(script_dir, file_path)
+    if os.path.isfile(p):
+        with open(p, 'rb') as f:
+            return hashlib.sha256(f.read()).hexdigest()
+    else:
+        return ''
+
+
 def generate_alphabetic_sequence(index):
     result = ''
     while index >= 0:

@@ -238,6 +214,51 @@ def get_user_choice(question, options_dict):
     return choice


+def update_pytorch_and_python():
+    print_big_message("Checking for PyTorch updates.")
+
+    # Update the Python version. Left here for future reference in case this becomes necessary.
+    # print_big_message("Checking for PyTorch and Python updates.")
+    # current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
+    # if current_python_version != PYTHON_VERSION:
+    #     run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True)
+
+    torver = torch_version()
+    base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}"
+
+    if "+cu" in torver:
+        install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
+    elif "+rocm" in torver:
+        install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
+    elif "+cpu" in torver:
+        install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
+    elif "+cxx11" in torver:
+        intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
+        install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
+    else:
+        install_cmd = base_cmd
+
+    run_cmd(install_cmd, assert_success=True, environment=True)
+
+
+def clean_outdated_pytorch_cuda_dependencies():
+    patterns = ["cu121", "cu122", "torch2.4"]
+    result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True)
+    matching_packages = []
+
+    for line in result.stdout.decode('utf-8').splitlines():
+        if "==" in line:
+            pkg_name, version = line.split('==', 1)
+            if any(pattern in version for pattern in patterns):
+                matching_packages.append(pkg_name)
+
+    if matching_packages:
+        print(f"\nUninstalling: {', '.join(matching_packages)}\n")
+        run_cmd(f"python -m pip uninstall -y {' '.join(matching_packages)}", assert_success=True, environment=True)
+
+    return matching_packages
+
+
 def install_webui():
     if os.path.isfile(state_file):
         os.remove(state_file)
||||||
|
@ -323,37 +344,6 @@ def install_webui():
|
||||||
update_requirements(initial_installation=True, pull=False)
|
update_requirements(initial_installation=True, pull=False)
|
||||||
|
|
||||||
|
|
||||||
def get_extensions_names():
|
|
||||||
return [foldername for foldername in os.listdir('extensions') if os.path.isfile(os.path.join('extensions', foldername, 'requirements.txt'))]
|
|
||||||
|
|
||||||
|
|
||||||
def install_extensions_requirements():
|
|
||||||
print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.")
|
|
||||||
extensions = get_extensions_names()
|
|
||||||
for i, extension in enumerate(extensions):
|
|
||||||
print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n")
|
|
||||||
extension_req_path = os.path.join("extensions", extension, "requirements.txt")
|
|
||||||
run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True)
|
|
||||||
|
|
||||||
|
|
||||||
def clean_outdated_pytorch_cuda_dependencies():
|
|
||||||
patterns = ["cu121", "cu122", "torch2.4"]
|
|
||||||
result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True)
|
|
||||||
matching_packages = []
|
|
||||||
|
|
||||||
for line in result.stdout.decode('utf-8').splitlines():
|
|
||||||
if "==" in line:
|
|
||||||
pkg_name, version = line.split('==', 1)
|
|
||||||
if any(pattern in version for pattern in patterns):
|
|
||||||
matching_packages.append(pkg_name)
|
|
||||||
|
|
||||||
if matching_packages:
|
|
||||||
print(f"\nUninstalling: {', '.join(matching_packages)}\n")
|
|
||||||
run_cmd(f"python -m pip uninstall -y {' '.join(matching_packages)}", assert_success=True, environment=True)
|
|
||||||
|
|
||||||
return matching_packages
|
|
||||||
|
|
||||||
|
|
||||||
def update_requirements(initial_installation=False, pull=True):
|
def update_requirements(initial_installation=False, pull=True):
|
||||||
# Create .git directory if missing
|
# Create .git directory if missing
|
||||||
if not os.path.exists(os.path.join(script_dir, ".git")):
|
if not os.path.exists(os.path.join(script_dir, ".git")):
|
||||||
|
@ -366,14 +356,18 @@ def update_requirements(initial_installation=False, pull=True):
|
||||||
)
|
)
|
||||||
|
|
||||||
torver = torch_version()
|
torver = torch_version()
|
||||||
|
requirements_base = os.path.join("requirements", "full")
|
||||||
|
|
||||||
if "+rocm" in torver:
|
if "+rocm" in torver:
|
||||||
requirements_file = "requirements_amd" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt"
|
file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt"
|
||||||
elif "+cpu" in torver or "+cxx11" in torver:
|
elif "+cpu" in torver or "+cxx11" in torver:
|
||||||
requirements_file = "requirements_cpu_only" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt"
|
file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt"
|
||||||
elif is_macos():
|
elif is_macos():
|
||||||
requirements_file = "requirements_apple_" + ("intel" if is_x86_64() else "silicon") + ".txt"
|
file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt"
|
||||||
else:
|
else:
|
||||||
requirements_file = "requirements" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt"
|
file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt"
|
||||||
|
|
||||||
|
requirements_file = os.path.join(requirements_base, file_name)
|
||||||
|
|
||||||
# Load state from JSON file
|
# Load state from JSON file
|
||||||
current_commit = get_current_commit()
|
current_commit = get_current_commit()
|
||||||
|
@ -475,6 +469,15 @@ def update_requirements(initial_installation=False, pull=True):
|
||||||
clear_cache()
|
clear_cache()
|
||||||
|
|
||||||
|
|
||||||
|
def install_extensions_requirements():
|
||||||
|
print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.")
|
||||||
|
extensions = get_extensions_names()
|
||||||
|
for i, extension in enumerate(extensions):
|
||||||
|
print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n")
|
||||||
|
extension_req_path = os.path.join("extensions", extension, "requirements.txt")
|
||||||
|
run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True)
|
||||||
|
|
||||||
|
|
||||||
def launch_webui():
|
def launch_webui():
|
||||||
run_cmd(f"python server.py {flags}", environment=True)
|
run_cmd(f"python server.py {flags}", environment=True)
|
||||||
|
|
||||||
|
|
|
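For readers following the installer changes above, here is a small standalone sketch (not part of the commit) of the pattern matching that clean_outdated_pytorch_cuda_dependencies() applies to `pip list --format=freeze` output. The sample freeze text and the function name are illustrative only.

# Illustration only (not from the commit): how the pattern match identifies
# packages to uninstall, run against a hard-coded sample of freeze output.
SAMPLE_FREEZE = """\
torch==2.4.1+cu121
torchvision==0.19.1+cu121
numpy==1.26.4
exllamav2==0.2.8+cu124.torch2.6.0
"""

def find_outdated(freeze_output: str, patterns=("cu121", "cu122", "torch2.4")) -> list[str]:
    matching = []
    for line in freeze_output.splitlines():
        if "==" in line:
            pkg_name, version = line.split("==", 1)
            if any(p in version for p in patterns):
                matching.append(pkg_name)
    return matching

print(find_outdated(SAMPLE_FREEZE))  # ['torch', 'torchvision']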
@@ -7,7 +7,6 @@ fastapi==0.112.4
 gradio==4.37.*
 jinja2==3.1.6
 markdown
-numba==0.59.*
 numpy==1.26.*
 pandas
 peft==0.15.*
@@ -26,14 +25,13 @@ tqdm
 wandb
 
 # API
-SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
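The `;`-suffixed conditions on the wheel URLs above are standard PEP 508 environment markers; pip evaluates them against the installing interpreter and platform. As an illustration only (not part of the commit), they can be evaluated directly with the `packaging` library, which pip itself relies on:

# Illustration only: evaluating the environment markers used after ';' in the wheel URLs.
from packaging.markers import Marker

win_marker = Marker('platform_system == "Windows" and python_version == "3.11"')
linux_marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')

# True on a Windows + CPython 3.11 interpreter, False elsewhere:
print(win_marker.evaluate())

# Evaluate against an explicit (partial) environment instead of the running interpreter:
print(linux_marker.evaluate({"platform_system": "Linux", "platform_machine": "x86_64", "python_version": "3.11"}))  # True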
@@ -6,7 +6,6 @@ fastapi==0.112.4
 gradio==4.37.*
 jinja2==3.1.6
 markdown
-numba==0.59.*
 numpy==1.26.*
 pandas
 peft==0.15.*
@@ -25,12 +24,11 @@ tqdm
 wandb
 
 # API
-SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
@@ -6,7 +6,6 @@ fastapi==0.112.4
 gradio==4.37.*
 jinja2==3.1.6
 markdown
-numba==0.59.*
 numpy==1.26.*
 pandas
 peft==0.15.*
@@ -25,12 +24,11 @@ tqdm
 wandb
 
 # API
-SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
@@ -6,7 +6,6 @@ fastapi==0.112.4
 gradio==4.37.*
 jinja2==3.1.6
 markdown
-numba==0.59.*
 numpy==1.26.*
 pandas
 peft==0.15.*
@@ -25,14 +24,12 @@ tqdm
 wandb
 
 # API
-SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl
@@ -6,7 +6,6 @@ fastapi==0.112.4
 gradio==4.37.*
 jinja2==3.1.6
 markdown
-numba==0.59.*
 numpy==1.26.*
 pandas
 peft==0.15.*
@@ -25,13 +24,13 @@ tqdm
 wandb
 
 # API
-SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl
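A note on the Mac wheel markers above: `platform_release` compares against the Darwin kernel version, where 22.x corresponds to macOS 13, 23.x to macOS 14, and 24.x to macOS 15. The following sketch (illustrative only, not from the commit) reports which wheel tag the running machine would fall under:

# Illustration only: map the local Darwin kernel major version to the macOS wheel tag used above.
import platform

release = platform.release()  # e.g. "23.5.0" on macOS 14
major = int(release.split(".")[0]) if platform.system() == "Darwin" else None
wheel_tag = {22: "macosx_13_0", 23: "macosx_14_0", 24: "macosx_15_0"}.get(major, "no matching wheel")
print(platform.system(), release, "->", wheel_tag)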
@@ -6,7 +6,6 @@ fastapi==0.112.4
 gradio==4.37.*
 jinja2==3.1.6
 markdown
-numba==0.59.*
 numpy==1.26.*
 pandas
 peft==0.15.*
@@ -25,11 +24,10 @@ tqdm
 wandb
 
 # API
-SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
@@ -6,7 +6,6 @@ fastapi==0.112.4
 gradio==4.37.*
 jinja2==3.1.6
 markdown
-numba==0.59.*
 numpy==1.26.*
 pandas
 peft==0.15.*
@@ -25,11 +24,10 @@ tqdm
 wandb
 
 # API
-SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
@@ -7,7 +7,6 @@ fastapi==0.112.4
 gradio==4.37.*
 jinja2==3.1.6
 markdown
-numba==0.59.*
 numpy==1.26.*
 pandas
 peft==0.15.*
@@ -26,14 +25,13 @@ tqdm
 wandb
 
 # API
-SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
@@ -6,7 +6,6 @@ fastapi==0.112.4
 gradio==4.37.*
 jinja2==3.1.6
 markdown
-numba==0.59.*
 numpy==1.26.*
 pandas
 peft==0.15.*
@@ -25,7 +24,6 @@ tqdm
 wandb
 
 # API
-SpeechRecognition==3.10.0
 flask_cloudflared==0.0.14
 sse-starlette==1.6.5
 tiktoken
19 requirements/portable/requirements.txt Normal file
@@ -0,0 +1,19 @@
+fastapi==0.112.4
+gradio==4.37.*
+jinja2==3.1.6
+markdown
+numpy==1.26.*
+pydantic==2.8.2
+pyyaml
+requests
+rich
+tqdm
+
+# API
+flask_cloudflared==0.0.14
+sse-starlette==1.6.5
+tiktoken
+
+# CUDA wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
18 requirements/portable/requirements_amd.txt Normal file
@@ -0,0 +1,18 @@
+fastapi==0.112.4
+gradio==4.37.*
+jinja2==3.1.6
+markdown
+numpy==1.26.*
+pydantic==2.8.2
+pyyaml
+requests
+rich
+tqdm
+
+# API
+flask_cloudflared==0.0.14
+sse-starlette==1.6.5
+tiktoken
+
+# AMD wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
18 requirements/portable/requirements_amd_noavx2.txt Normal file
@@ -0,0 +1,18 @@
+fastapi==0.112.4
+gradio==4.37.*
+jinja2==3.1.6
+markdown
+numpy==1.26.*
+pydantic==2.8.2
+pyyaml
+requests
+rich
+tqdm
+
+# API
+flask_cloudflared==0.0.14
+sse-starlette==1.6.5
+tiktoken
+
+# AMD wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
19 requirements/portable/requirements_apple_intel.txt Normal file
@@ -0,0 +1,19 @@
+fastapi==0.112.4
+gradio==4.37.*
+jinja2==3.1.6
+markdown
+numpy==1.26.*
+pydantic==2.8.2
+pyyaml
+requests
+rich
+tqdm
+
+# API
+flask_cloudflared==0.0.14
+sse-starlette==1.6.5
+tiktoken
+
+# Mac wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
20 requirements/portable/requirements_apple_silicon.txt Normal file
@@ -0,0 +1,20 @@
+fastapi==0.112.4
+gradio==4.37.*
+jinja2==3.1.6
+markdown
+numpy==1.26.*
+pydantic==2.8.2
+pyyaml
+requests
+rich
+tqdm
+
+# API
+flask_cloudflared==0.0.14
+sse-starlette==1.6.5
+tiktoken
+
+# Mac wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
19 requirements/portable/requirements_cpu_only.txt Normal file
@@ -0,0 +1,19 @@
+fastapi==0.112.4
+gradio==4.37.*
+jinja2==3.1.6
+markdown
+numpy==1.26.*
+pydantic==2.8.2
+pyyaml
+requests
+rich
+tqdm
+
+# API
+flask_cloudflared==0.0.14
+sse-starlette==1.6.5
+tiktoken
+
+# llama.cpp (CPU only, AVX2)
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
19 requirements/portable/requirements_cpu_only_noavx2.txt Normal file
@@ -0,0 +1,19 @@
+fastapi==0.112.4
+gradio==4.37.*
+jinja2==3.1.6
+markdown
+numpy==1.26.*
+pydantic==2.8.2
+pyyaml
+requests
+rich
+tqdm
+
+# API
+flask_cloudflared==0.0.14
+sse-starlette==1.6.5
+tiktoken
+
+# llama.cpp (CPU only, no AVX2)
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
19 requirements/portable/requirements_noavx2.txt Normal file
@@ -0,0 +1,19 @@
+fastapi==0.112.4
+gradio==4.37.*
+jinja2==3.1.6
+markdown
+numpy==1.26.*
+pydantic==2.8.2
+pyyaml
+requests
+rich
+tqdm
+
+# API
+flask_cloudflared==0.0.14
+sse-starlette==1.6.5
+tiktoken
+
+# CUDA wheels
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
15 requirements/portable/requirements_nowheels.txt Normal file
@@ -0,0 +1,15 @@
+fastapi==0.112.4
+gradio==4.37.*
+jinja2==3.1.6
+markdown
+numpy==1.26.*
+pydantic==2.8.2
+pyyaml
+requests
+rich
+tqdm
+
+# API
+flask_cloudflared==0.0.14
+sse-starlette==1.6.5
+tiktoken
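To see at a glance what the new portable requirement sets drop relative to the full ones, a small comparison script can be used. This is an illustrative sketch only (not part of the commit); it assumes the requirements/full/ and requirements/portable/ layout referenced above and runs from the repository root.

# Illustration only: list pinned packages present in the full requirements but not the portable ones.
from pathlib import Path
import re

def requirement_names(path: Path) -> set[str]:
    names = set()
    for line in path.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#") or line.startswith("https://"):
            continue
        # Take the distribution name before any version specifier or environment marker.
        names.add(re.split(r"[=;<>\[ ]", line, maxsplit=1)[0].lower())
    return names

full = requirement_names(Path("requirements/full/requirements.txt"))
portable = requirement_names(Path("requirements/portable/requirements.txt"))
print("only in full:", sorted(full - portable))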
26 server.py
@@ -1,11 +1,8 @@
 import os
 import warnings
 
-from modules import shared
-
-import accelerate  # This early import makes Intel GPUs happy
-
 import modules.one_click_installer_check
+from modules import shared
 from modules.block_requests import OpenMonkeyPatch, RequestBlocker
 from modules.logging_colors import logger
 
@@ -38,7 +35,6 @@ import yaml
 
 import modules.extensions as extensions_module
 from modules import (
-    chat,
     training,
     ui,
     ui_chat,
@@ -89,7 +85,7 @@ def create_interface():
 
     # Force some events to be triggered on page load
     shared.persistent_interface_state.update({
-        'loader': shared.args.loader or 'Transformers',
+        'loader': shared.args.loader or 'llama.cpp',
         'mode': shared.settings['mode'] if shared.settings['mode'] == 'instruct' else gr.update(),
         'character_menu': shared.args.character or shared.settings['character'],
         'instruction_template_str': shared.settings['instruction_template_str'],
@@ -218,10 +214,28 @@ if __name__ == "__main__":
         if extension not in shared.args.extensions:
             shared.args.extensions.append(extension)
 
+    available_models = utils.get_available_models()
+
     # Model defined through --model
     if shared.args.model is not None:
         shared.model_name = shared.args.model
 
+    # Select the model from a command-line menu
+    elif shared.args.model_menu:
+        if len(available_models) == 0:
+            logger.error('No models are available! Please download at least one.')
+            sys.exit(0)
+        else:
+            print('The following models are available:\n')
+            for i, model in enumerate(available_models):
+                print(f'{i+1}. {model}')
+
+            print(f'\nWhich one do you want to load? 1-{len(available_models)}\n')
+            i = int(input()) - 1
+            print()
+
+        shared.model_name = available_models[i]
+
     # If any model has been selected, load it
     if shared.model_name != 'None':
         p = Path(shared.model_name)
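The --model-menu branch added above prints a numbered list and reads an index from stdin. As a standalone sketch (not the project's code), the same idea with basic input validation looks like this:

# Standalone sketch: numbered model menu with input validation; model names are made up.
def choose_model(available_models: list[str]) -> str | None:
    if not available_models:
        print("No models are available! Please download at least one.")
        return None

    print("The following models are available:\n")
    for i, model in enumerate(available_models, start=1):
        print(f"{i}. {model}")

    while True:
        raw = input(f"\nWhich one do you want to load? 1-{len(available_models)}\n")
        if raw.isdigit() and 1 <= int(raw) <= len(available_models):
            return available_models[int(raw) - 1]
        print("Invalid choice, try again.")


if __name__ == "__main__":
    print(choose_model(["llama-3-8b.gguf", "mistral-7b.gguf"]))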
@@ -2,6 +2,12 @@
 
 cd "$(dirname "${BASH_SOURCE[0]}")"
 
+# Portable install case
+if [ -d "portable_env" ]; then
+    ./portable_env/bin/python3 server.py --api --auto-launch "$@"
+    exit $?
+fi
+
 if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
 
 # deactivate existing conda envs as needed to avoid conflicts
@@ -2,6 +2,12 @@
 
 cd "$(dirname "${BASH_SOURCE[0]}")"
 
+# Portable install case
+if [ -d "portable_env" ]; then
+    ./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@"
+    exit $?
+fi
+
 if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
 
 # deactivate existing conda envs as needed to avoid conflicts
@@ -3,6 +3,12 @@ setlocal enabledelayedexpansion
 
 cd /D "%~dp0"
 
+@rem Portable install case
+if exist "portable_env" (
+  .\portable_env\python.exe server.py --api --auto-launch %*
+  exit /b %errorlevel%
+)
+
 set PATH=%PATH%;%SystemRoot%\system32
 
 echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end