Merge pull request #6869 from oobabooga/dev

Merge dev branch
Authored by oobabooga on 2025-04-22 12:09:20 -03:00; committed by GitHub.
commit a778270536
48 changed files with 1292 additions and 705 deletions


@@ -5,8 +5,14 @@
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
- package-ecosystem: "pip"
directory: "/requirements/full/"
target-branch: "dev"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/requirements/portable/"
target-branch: "dev"
schedule:
interval: "weekly"


@@ -0,0 +1,49 @@
name: Build Everything TGW
on:
workflow_dispatch:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
permissions:
contents: write
jobs:
build_release_cuda_windows:
name: CUDA Windows
uses: ./.github/workflows/build-portable-release-cuda.yml
with:
version: ${{ inputs.version }}
config: 'os:windows-2019'
build_release_cuda_linux:
name: CUDA Linux
uses: ./.github/workflows/build-portable-release-cuda.yml
with:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'
build_release_cpu_windows:
name: CPU Windows
uses: ./.github/workflows/build-portable-release.yml
with:
version: ${{ inputs.version }}
config: 'os:windows-2019'
build_release_cpu_linux:
name: CPU Linux
uses: ./.github/workflows/build-portable-release.yml
with:
version: ${{ inputs.version }}
config: 'os:ubuntu-22.04'
build_release_macos:
name: macOS
uses: ./.github/workflows/build-portable-release.yml
with:
version: ${{ inputs.version }}
config: 'os:macos-13,macos-14'


@@ -0,0 +1,183 @@
name: Build CUDA
on:
workflow_dispatch:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
workflow_call:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
permissions:
contents: write
jobs:
define_matrix:
name: Define Build Matrix
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
defaults:
run:
shell: pwsh
env:
CONFIGIN: ${{ inputs.config }}
EXCLUDEIN: ${{ inputs.exclude }}
steps:
- name: Define Job Output
id: set-matrix
run: |
$matrix = @{
'os' = @('ubuntu-22.04', 'windows-2019')
'pyver' = @("3.11")
'avx' = @("AVX2")
'cuda' = @("11.7", "12.4")
}
if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
if ($env:EXCLUDEIN -ne 'None') {
$exclusions = @()
$exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
$matrix['exclude'] = $exclusions
}
$matrixOut = ConvertTo-Json $matrix -Compress
Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
build_wheels:
name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }} CUDA ${{ matrix.cuda }}
needs: define_matrix
runs-on: ${{ matrix.os }}
strategy:
matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
defaults:
run:
shell: pwsh
env:
AVXVER: ${{ matrix.avx }}
PCKGVER: ${{ inputs.version }}
steps:
- uses: actions/checkout@v4
with:
repository: 'oobabooga/text-generation-webui'
ref: ${{ inputs.version }}
submodules: 'recursive'
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.pyver }}
- name: Build Package
shell: bash
run: |
rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
# Define common variables
CUDA_VERSION="${{ matrix.cuda }}"
AVX_SUPPORT="${{ matrix.avx }}"
VERSION="${{ inputs.version }}"
# 1. Set platform-specific variables
if [[ "$RUNNER_OS" == "Windows" ]]; then
PLATFORM="windows"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-pc-windows-msvc-install_only.tar.gz"
PIP_PATH="portable_env/python.exe -m pip"
PACKAGES_PATH="portable_env/Lib/site-packages"
ZIP_CMD="powershell -Command \"Compress-Archive -Path text-generation-webui -DestinationPath"
rm start_linux.sh start_macos.sh
else
PLATFORM="linux"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
PIP_PATH="portable_env/bin/python -m pip"
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
ZIP_CMD="zip -r"
rm start_macos.sh start_windows.bat
fi
# 2. Download and extract Python
cd ..
echo "Downloading Python for $PLATFORM..."
curl -L -o python-build.tar.gz "$PYTHON_URL"
tar -xzf python-build.tar.gz
mv python text-generation-webui/portable_env
# 3. Prepare requirements file based on AVX and CUDA
if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
BASE_REQ_FILE="requirements/portable/requirements.txt"
else
BASE_REQ_FILE="requirements/portable/requirements_noavx2.txt"
fi
# Create CUDA-specific requirements file if needed
cd text-generation-webui
if [[ "$CUDA_VERSION" == "11.7" ]]; then
echo "Creating CUDA 11.7 specific requirements file"
sed 's/cu124/cu117/g' "$BASE_REQ_FILE" > requirements_cuda_temp.txt
REQ_FILE="requirements_cuda_temp.txt"
else
REQ_FILE="$BASE_REQ_FILE"
fi
# 4. Install packages
echo "Installing Python packages from $REQ_FILE..."
$PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
# 5. Clean up
if [[ "$CUDA_VERSION" == "11.7" ]]; then
rm requirements_cuda_temp.txt
fi
# 6. Create ZIP file
cd ..
VERSION_CLEAN="${VERSION#v}"
ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}-cuda${CUDA_VERSION}.zip"
echo "Creating archive: $ZIP_NAME"
if [[ "$RUNNER_OS" == "Windows" ]]; then
powershell -Command "Compress-Archive -Path text-generation-webui -DestinationPath $ZIP_NAME"
else
zip -r "$ZIP_NAME" text-generation-webui
fi
- name: Upload files to a GitHub release
id: upload-release
uses: svenstaro/upload-release-action@2.7.0
continue-on-error: true
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: ../textgen-portable-${{ inputs.version }}*.zip
tag: ${{ inputs.version }}
file_glob: true
make_latest: false
overwrite: true
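For reference, the `config` and `exclude` inputs above use a compact `key1:item1-1,item1-2;key2:item2-1` syntax that the PowerShell step expands into a JSON build matrix. Below is a rough Python re-implementation of that parsing, for illustration only; `build_matrix` and its defaults are not part of the repository.

```python
# Hypothetical re-implementation of the matrix parsing done by the pwsh step above.
# "Default"/"None" keep the hard-coded axes and skip exclusions, matching the inputs.
import json


def build_matrix(config: str = "Default", exclude: str = "None") -> str:
    matrix = {
        "os": ["ubuntu-22.04", "windows-2019"],
        "pyver": ["3.11"],
        "avx": ["AVX2"],
        "cuda": ["11.7", "12.4"],
    }
    if config != "Default":
        # "os:windows-2019;cuda:12.4" overrides the listed axes
        for part in config.split(";"):
            key, _, items = part.partition(":")
            matrix[key] = items.split(",")

    if exclude != "None":
        # "os:windows-2019,cuda:11.7;os:macos-14,avx:AVX2" -> list of exclusion dicts
        matrix["exclude"] = [
            dict(item.split(":", 1) for item in group.split(","))
            for group in exclude.split(";")
        ]

    return json.dumps(matrix, separators=(",", ":"))


print(build_matrix(config="os:windows-2019"))
```

Calling `build_matrix(config='os:windows-2019')` would, under these assumptions, restrict the matrix to a single Windows runner while keeping the default Python, AVX, and CUDA axes.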


@@ -0,0 +1,193 @@
name: Build CPU and macOS
on:
workflow_dispatch:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Override configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
workflow_call:
inputs:
version:
description: 'Version tag of text-generation-webui to build: v3.0'
default: 'v3.0'
required: true
type: string
config:
description: 'Configurations to build: key1:item1-1,item1-2;key2:item2-1,item2-2'
default: 'Default'
required: false
type: string
exclude:
description: 'Exclude build configurations: key1-1:item1-1,key1-2:item1-2;key2-1:item2-1,key2-2:item2-2'
default: 'None'
required: false
type: string
permissions:
contents: write
jobs:
define_matrix:
name: Define Build Matrix
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
defaults:
run:
shell: pwsh
env:
CONFIGIN: ${{ inputs.config }}
EXCLUDEIN: ${{ inputs.exclude }}
steps:
- name: Define Job Output
id: set-matrix
run: |
$matrix = @{
'os' = @('ubuntu-22.04', 'windows-2019', 'macos-13', 'macos-14')
'pyver' = @("3.11")
'avx' = @("AVX2")
}
if ($env:CONFIGIN -ne 'Default') {$env:CONFIGIN.split(';').foreach({$matrix[$_.split(':')[0]] = $_.split(':')[1].split(',')})}
if ($env:EXCLUDEIN -ne 'None') {
$exclusions = @()
$exclusions += $env:EXCLUDEIN.split(';').replace(':','=').replace(',',"`n") | ConvertFrom-StringData
$matrix['exclude'] = $exclusions
}
$matrixOut = ConvertTo-Json $matrix -Compress
Write-Output ('matrix=' + $matrixOut) >> $env:GITHUB_OUTPUT
build_wheels:
name: ${{ matrix.os }} ${{ matrix.pyver }} CPU ${{ matrix.avx }}
needs: define_matrix
runs-on: ${{ matrix.os }}
strategy:
matrix: ${{ fromJSON(needs.define_matrix.outputs.matrix) }}
defaults:
run:
shell: pwsh
env:
AVXVER: ${{ matrix.avx }}
PCKGVER: ${{ inputs.version }}
steps:
- uses: actions/checkout@v4
with:
repository: 'oobabooga/text-generation-webui'
ref: ${{ inputs.version }}
submodules: 'recursive'
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.pyver }}
- name: Build Package
shell: bash
run: |
rm -rf .git cmd* update_wizard* start_wsl.bat wsl.sh Colab-TextGen-GPU.ipynb docker
# Define common variables
AVX_SUPPORT="${{ matrix.avx }}"
VERSION="${{ inputs.version }}"
OS_TYPE="${{ matrix.os }}"
# 1. Set platform-specific variables
if [[ "$RUNNER_OS" == "Windows" ]]; then
PLATFORM="windows-cpu"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-pc-windows-msvc-install_only.tar.gz"
PIP_PATH="portable_env/python.exe -m pip"
PACKAGES_PATH="portable_env/Lib/site-packages"
rm start_linux.sh start_macos.sh
elif [[ "$RUNNER_OS" == "macOS" ]]; then
if [[ "$OS_TYPE" == "macos-13" ]]; then
PLATFORM="macos-x86_64"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-apple-darwin-install_only.tar.gz"
REQ_TYPE="apple_intel"
else
PLATFORM="macos-arm64"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-aarch64-apple-darwin-install_only.tar.gz"
REQ_TYPE="apple_silicon"
fi
PIP_PATH="portable_env/bin/python -m pip"
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
rm start_linux.sh start_windows.bat
else
# Linux case
PLATFORM="linux-cpu"
PYTHON_URL="https://github.com/astral-sh/python-build-standalone/releases/download/20250409/cpython-3.11.12+20250409-x86_64-unknown-linux-gnu-install_only.tar.gz"
PIP_PATH="portable_env/bin/python -m pip"
PACKAGES_PATH="portable_env/lib/python3.11/site-packages"
rm start_macos.sh start_windows.bat
fi
# 2. Download and extract Python
echo "Downloading Python for $PLATFORM..."
cd ..
curl -L -o python-build.tar.gz "$PYTHON_URL"
tar -xzf python-build.tar.gz
mv python text-generation-webui/portable_env
# 3. Prepare requirements file based on platform and AVX
cd text-generation-webui
# Select requirements file based on platform
if [[ "$RUNNER_OS" == "macOS" ]]; then
if [[ "$OS_TYPE" == "macos-13" ]]; then
REQ_FILE="requirements/portable/requirements_apple_intel.txt"
else
REQ_FILE="requirements/portable/requirements_apple_silicon.txt"
fi
else
# For Windows and Linux, check AVX support
if [[ "$AVX_SUPPORT" == "AVX2" ]]; then
REQ_FILE="requirements/portable/requirements_cpu_only.txt"
else
REQ_FILE="requirements/portable/requirements_cpu_only_noavx2.txt"
fi
fi
echo "Using requirements file: $REQ_FILE"
# 4. Install packages
echo "Installing Python packages from $REQ_FILE..."
$PIP_PATH install --target="./$PACKAGES_PATH" -r "$REQ_FILE"
# 5. Create ZIP file
cd ..
VERSION_CLEAN="${VERSION#v}"
ZIP_NAME="textgen-portable-${VERSION_CLEAN}-${PLATFORM}.zip"
echo "Creating archive: $ZIP_NAME"
if [[ "$RUNNER_OS" == "Windows" ]]; then
powershell -Command "Compress-Archive -Path text-generation-webui -DestinationPath $ZIP_NAME"
else
zip -r "$ZIP_NAME" text-generation-webui
fi
- name: Upload files to a GitHub release
id: upload-release
uses: svenstaro/upload-release-action@2.7.0
continue-on-error: true
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: ../textgen-portable-${{ inputs.version }}*.zip
tag: ${{ inputs.version }}
file_glob: true
make_latest: false
overwrite: true


@@ -27,6 +27,14 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
## How to install
#### Option 1: Portable builds
Compatible with GGUF (llama.cpp) models, just unzip and run, no installation. Available for Windows, Linux, and macOS.
Download from: https://github.com/oobabooga/text-generation-webui/releases
#### Option 2: One-click installer
1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip).
2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`.
3) Select your GPU vendor when asked.
@@ -352,6 +360,10 @@ Run `python download-model.py --help` to see all the options.
https://colab.research.google.com/github/oobabooga/text-generation-webui/blob/main/Colab-TextGen-GPU.ipynb
## Community
https://www.reddit.com/r/Oobabooga/
## Acknowledgment
In August 2023, [Andreessen Horowitz](https://a16z.com/) (a16z) provided a generous grant to encourage and support my independent work on this project. I am **extremely** grateful for their trust and recognition.


@@ -7,10 +7,7 @@ from io import BytesIO
import requests
import tiktoken
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import LogitsProcessor, LogitsProcessorList
from extensions.openai.errors import InvalidRequestError
from extensions.openai.utils import debug_msg
@@ -22,54 +19,7 @@ from modules.chat import (
load_instruction_template_memoized
)
from modules.presets import load_preset_memoized
from modules.text_generation import (
decode,
encode,
generate_reply,
get_reply_from_output_ids
)
from modules.text_generation import decode, encode, generate_reply
class LogitsBiasProcessor(LogitsProcessor):
def __init__(self, logit_bias={}):
self.logit_bias = logit_bias
if self.logit_bias:
self.keys = list([int(key) for key in self.logit_bias.keys()])
values = [self.logit_bias[str(key)] for key in self.keys]
self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device)
debug_msg(f"{self})")
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logit_bias:
debug_msg(logits[0, self.keys], " + ", self.values)
logits[0, self.keys] += self.values
debug_msg(" --> ", logits[0, self.keys])
debug_msg(" max/min ", float(torch.max(logits[0])), float(torch.min(logits[0])))
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>"
class LogprobProcessor(LogitsProcessor):
def __init__(self, logprobs=None):
self.logprobs = logprobs
self.token_alternatives = {}
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logprobs is not None: # 0-5
log_e_probabilities = F.log_softmax(logits, dim=1)
top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1)
top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]]
top_probs = [float(x) for x in top_values[0]]
self.token_alternatives = dict(zip(top_tokens, top_probs))
debug_msg(repr(self))
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>"
def convert_logprobs_to_tiktoken(model, logprobs):
@@ -107,21 +57,29 @@ def process_parameters(body, is_legacy=False):
elif isinstance(body['stop'], list):
generate_params['custom_stopping_strings'] = body['stop']
logits_processor = []
logit_bias = body.get('logit_bias', None)
if logit_bias: # {str: float, ...}
logits_processor = [LogitsBiasProcessor(logit_bias)]
logprobs = None # coming to chat eventually
if 'logprobs' in body:
logprobs = body.get('logprobs', 0) # maybe cap at topk? don't clamp 0-5.
generate_params['logprob_proc'] = LogprobProcessor(logprobs)
logits_processor.extend([generate_params['logprob_proc']])
else:
logprobs = None
if logits_processor: # requires logits_processor support
generate_params['logits_processor'] = LogitsProcessorList(logits_processor)
if shared.args.loader != 'llama.cpp':
from transformers import LogitsProcessorList
from modules.transformers_loader import (
LogitsBiasProcessor,
LogprobProcessor
)
logits_processor = []
logit_bias = body.get('logit_bias', None)
if logit_bias: # {str: float, ...}
logits_processor = [LogitsBiasProcessor(logit_bias)]
logprobs = None # coming to chat eventually
if 'logprobs' in body:
logprobs = body.get('logprobs', 0) # maybe cap at topk? don't clamp 0-5.
generate_params['logprob_proc'] = LogprobProcessor(logprobs)
logits_processor.extend([generate_params['logprob_proc']])
else:
logprobs = None
if logits_processor: # requires logits_processor support
generate_params['logits_processor'] = LogitsProcessorList(logits_processor)
return generate_params
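The hunk above defers the transformers import and pulls `LogitsBiasProcessor`/`LogprobProcessor` from the new `modules.transformers_loader` module, so llama.cpp-only installs never import torch. As a minimal, self-contained sketch of what a logit-bias step does (shapes and names are illustrative, not the project's actual class):

```python
# Minimal sketch of the logit-bias idea: add a per-token-id offset to the
# last-step logits. Shapes and the helper name are assumptions for the example.
import torch


def apply_logit_bias(logits: torch.Tensor, logit_bias: dict) -> torch.Tensor:
    """logits: [batch, vocab]; logit_bias: {"token_id": bias} as in the OpenAI API."""
    keys = [int(k) for k in logit_bias]
    values = torch.tensor([logit_bias[str(k)] for k in keys], dtype=logits.dtype)
    logits[0, keys] += values  # bias only the first sequence in the batch
    return logits


biased = apply_logit_bias(torch.zeros(1, 8), {"3": 5.0, "5": -100.0})
print(biased)
```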


@@ -6,7 +6,6 @@ import traceback
from collections import deque
from threading import Thread
import speech_recognition as sr
import uvicorn
from fastapi import Depends, FastAPI, Header, HTTPException
from fastapi.middleware.cors import CORSMiddleware
@@ -16,11 +15,9 @@ from pydub import AudioSegment
from sse_starlette import EventSourceResponse
import extensions.openai.completions as OAIcompletions
import extensions.openai.embeddings as OAIembeddings
import extensions.openai.images as OAIimages
import extensions.openai.logits as OAIlogits
import extensions.openai.models as OAImodels
import extensions.openai.moderations as OAImoderations
from extensions.openai.errors import ServiceUnavailableError
from extensions.openai.tokens import token_count, token_decode, token_encode
from extensions.openai.utils import _start_cloudflared
@@ -165,6 +162,8 @@ def handle_billing_usage():
@app.post('/v1/audio/transcriptions', dependencies=check_key)
async def handle_audio_transcription(request: Request):
import speech_recognition as sr
r = sr.Recognizer()
form = await request.form()
@@ -211,6 +210,8 @@ async def handle_image_generation(request: Request):
@app.post("/v1/embeddings", response_model=EmbeddingsResponse, dependencies=check_key)
async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
import extensions.openai.embeddings as OAIembeddings
input = request_data.input
if not input:
raise HTTPException(status_code=400, detail="Missing required argument input")
@@ -224,6 +225,8 @@ async def handle_embeddings(request: Request, request_data: EmbeddingsRequest):
@app.post("/v1/moderations", dependencies=check_key)
async def handle_moderations(request: Request):
import extensions.openai.moderations as OAImoderations
body = await request.json()
input = body["input"]
if not input:
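The pattern in this file is to move optional dependencies (`speech_recognition`, the embeddings and moderations modules) into the request handlers that need them, so the API server can start even when those extras are not installed. A hedged, generic sketch of the same idea with FastAPI (the endpoint and response shown are invented for the example):

```python
# Illustrative pattern only: import an optional dependency when the endpoint
# that needs it is called, so startup does not fail if the package is missing.
from fastapi import FastAPI, HTTPException

app = FastAPI()


@app.post("/v1/audio/transcriptions")
async def transcribe():
    try:
        import speech_recognition as sr  # deferred, optional dependency
    except ImportError:
        raise HTTPException(status_code=500, detail="speech_recognition is not installed")

    return {"recognizer": type(sr.Recognizer()).__name__}
```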


@@ -2,7 +2,6 @@ from pathlib import Path
import modules.shared as shared
from modules.logging_colors import logger
from modules.models import get_device
def add_lora_to_model(lora_names):
@@ -47,9 +46,10 @@ def add_lora_exllamav2(lora_names):
def add_lora_transformers(lora_names):
from peft import PeftModel
from modules.torch_utils import get_device
prior_set = set(shared.lora_names)
added_set = set(lora_names) - prior_set
removed_set = prior_set - set(lora_names)


@@ -2,9 +2,6 @@ import traceback
from queue import Queue
from threading import Thread
import torch
import transformers
import modules.shared as shared
@@ -12,25 +9,6 @@ class StopNowException(Exception):
pass
class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
def __init__(self):
transformers.StoppingCriteria.__init__(self)
def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
return shared.stop_everything
class Stream(transformers.StoppingCriteria):
def __init__(self, callback_func=None):
self.callback_func = callback_func
def __call__(self, input_ids, scores) -> bool:
if self.callback_func is not None:
self.callback_func(input_ids[0])
return False
class Iteratorize:
"""


@@ -2,13 +2,11 @@ import datetime
from pathlib import Path
import pandas as pd
import torch
from datasets import load_dataset
from tqdm import tqdm
from modules import shared
from modules.logging_colors import logger
from modules.models import clear_torch_cache, load_model, unload_model
from modules.models import load_model, unload_model
from modules.models_settings import get_model_metadata, update_model_parameters
from modules.text_generation import encode
@@ -39,6 +37,11 @@ def calculate_perplexity(models, input_dataset, stride, _max_length):
https://huggingface.co/docs/transformers/perplexity#calculating-ppl-with-fixedlength-models
'''
import torch
from datasets import load_dataset
from modules.torch_utils import clear_torch_cache
if shared.args.loader == "llama.cpp":
logger.error("Perplexity evaluation is not implemented for the llama.cpp loader.")
raise ValueError


@@ -4,10 +4,6 @@ from pathlib import Path
from typing import Any, Dict, Optional, Union
import torch
from torch.nn import CrossEntropyLoss
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
from exllamav2 import (
ExLlamaV2,
ExLlamaV2Cache,
@@ -18,6 +14,15 @@ from exllamav2 import (
ExLlamaV2Cache_TP,
ExLlamaV2Config
)
from torch.nn import CrossEntropyLoss
from transformers import (
GenerationConfig,
GenerationMixin,
PretrainedConfig,
PreTrainedModel
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from modules import shared
from modules.logging_colors import logger
@@ -28,7 +33,7 @@ except Exception:
traceback.print_exc()
class Exllamav2HF(PreTrainedModel):
class Exllamav2HF(PreTrainedModel, GenerationMixin):
def __init__(self, config: ExLlamaV2Config):
super().__init__(PretrainedConfig())
self.ex_config = config


@@ -6,7 +6,12 @@ from typing import Any, Dict, Optional, Union
import torch
from exllamav3 import Cache, Config, Model
from torch.nn import CrossEntropyLoss
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
from transformers import (
GenerationConfig,
GenerationMixin,
PretrainedConfig,
PreTrainedModel
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from modules import shared
@@ -19,7 +24,7 @@ except Exception:
traceback.print_exc()
class Exllamav3HF(PreTrainedModel):
class Exllamav3HF(PreTrainedModel, GenerationMixin):
def __init__(self, model_dir):
super().__init__(PretrainedConfig())
self.generation_config = GenerationConfig()
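Both ExLlama wrappers now inherit from `GenerationMixin` in addition to `PreTrainedModel`, presumably because recent transformers releases no longer provide `generate()` through `PreTrainedModel` alone. A structural sketch of the pattern follows; the `TinyWrapper` class and its dummy `forward` are invented for illustration and are not the project's code.

```python
# Structural sketch: a custom wrapper keeps generation support by mixing
# GenerationMixin in explicitly. A real wrapper would also implement
# prepare_inputs_for_generation and call its inference backend in forward().
import torch
from transformers import GenerationConfig, GenerationMixin, PretrainedConfig, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast


class TinyWrapper(PreTrainedModel, GenerationMixin):
    def __init__(self):
        super().__init__(PretrainedConfig())
        self.generation_config = GenerationConfig()

    def forward(self, input_ids=None, **kwargs):
        # Placeholder logits instead of a real backend call.
        vocab_size = 32
        logits = torch.zeros(input_ids.shape[0], input_ids.shape[1], vocab_size)
        return CausalLMOutputWithPast(logits=logits)
```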


@@ -1,4 +1,5 @@
import json
import os
import pprint
import socket
import subprocess
@@ -281,12 +282,21 @@ class LlamaServer:
if shared.args.rope_freq_base > 0:
cmd += ["--rope-freq-base", str(shared.args.rope_freq_base)]
env = os.environ.copy()
if os.name == 'posix':
current_path = env.get('LD_LIBRARY_PATH', '')
if current_path:
env['LD_LIBRARY_PATH'] = f"{current_path}:{os.path.dirname(self.server_path)}"
else:
env['LD_LIBRARY_PATH'] = os.path.dirname(self.server_path)
# Start the server with pipes for output
self.process = subprocess.Popen(
cmd,
stderr=subprocess.PIPE,
text=True,
bufsize=1
bufsize=1,
env=env
)
def filter_stderr(process_stderr):
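The new `env` block appends the server binary's directory to `LD_LIBRARY_PATH` on POSIX systems so the bundled `llama-server` can find shared libraries shipped next to it. A standalone sketch of the same environment handling; the binary path below is made up for the example:

```python
# Extend LD_LIBRARY_PATH with the directory containing a bundled binary
# before launching it, so it resolves .so files sitting next to it.
import os
import subprocess

server_path = "./portable_env/bin/llama-server"  # hypothetical path

env = os.environ.copy()
if os.name == "posix":
    lib_dir = os.path.dirname(os.path.abspath(server_path))
    current = env.get("LD_LIBRARY_PATH", "")
    env["LD_LIBRARY_PATH"] = f"{current}:{lib_dir}" if current else lib_dir

# subprocess.Popen([server_path, "--port", "8080"], env=env)  # launch with the patched env
print(env.get("LD_LIBRARY_PATH"))
```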


@@ -3,29 +3,7 @@ from collections import OrderedDict
import gradio as gr
from modules import shared
loaders_and_params = OrderedDict({
'Transformers': [
'gpu_memory',
'cpu_memory',
'alpha_value',
'compress_pos_emb',
'compute_dtype',
'quant_type',
'load_in_8bit',
'load_in_4bit',
'torch_compile',
'use_flash_attention_2',
'auto_devices',
'cpu',
'disk',
'use_double_quant',
'use_eager_attention',
'bf16',
'trust_remote_code',
'no_use_fast',
],
'llama.cpp': [
'n_gpu_layers',
'threads',
@@ -43,6 +21,25 @@ loaders_and_params = OrderedDict({
'mlock',
'numa',
],
'Transformers': [
'gpu_split',
'cpu_memory',
'alpha_value',
'compress_pos_emb',
'compute_dtype',
'quant_type',
'load_in_8bit',
'load_in_4bit',
'torch_compile',
'use_flash_attention_2',
'cpu',
'disk',
'use_double_quant',
'use_eager_attention',
'bf16',
'trust_remote_code',
'no_use_fast',
],
'ExLlamav3_HF': [
'max_seq_len',
'gpu_split',
@@ -346,10 +343,6 @@ def blacklist_samplers(loader, dynamic_temperature):
return output
def get_gpu_memory_keys():
return [k for k in shared.gradio if k.startswith('gpu_memory')]
@functools.cache
def get_all_params():
all_params = set()
@@ -357,11 +350,6 @@ def get_all_params():
for el in loaders_and_params[k]:
all_params.add(el)
if 'gpu_memory' in all_params:
all_params.remove('gpu_memory')
for k in get_gpu_memory_keys():
all_params.add(k)
return sorted(all_params)
@@ -371,8 +359,4 @@ def make_loader_params_visible(loader):
if loader in loaders_and_params:
params = loaders_and_params[loader]
if 'gpu_memory' in params:
params.remove('gpu_memory')
params += get_gpu_memory_keys()
return [gr.update(visible=True) if k in params else gr.update(visible=False) for k in all_params]
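With the `gpu_memory` special-casing removed, `make_loader_params_visible` reduces to a plain dictionary lookup: every known parameter gets a visibility update, and only the selected loader's parameters are shown. A simplified, Gradio-free sketch of that logic, using a truncated parameter list and a plain bool in place of `gr.update(...)`:

```python
# Simplified visibility logic: the selected loader's parameters are visible,
# everything else is hidden. Parameter lists are shortened for the example.
from collections import OrderedDict

loaders_and_params = OrderedDict({
    "llama.cpp": ["n_gpu_layers", "threads", "mlock"],
    "Transformers": ["cpu_memory", "load_in_4bit", "bf16"],
})

all_params = sorted({p for params in loaders_and_params.values() for p in params})


def visibility(loader: str) -> dict:
    params = loaders_and_params.get(loader, [])
    return {p: p in params for p in all_params}


print(visibility("llama.cpp"))
```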


@@ -2,11 +2,10 @@ import time
import traceback
import numpy as np
import torch
from modules import models, sampler_hijack, shared
from modules import models, shared
from modules.logging_colors import logger
from modules.models import get_device, load_model
from modules.models import load_model
from modules.text_generation import generate_reply
global_scores = None
@@ -38,18 +37,16 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
logger.error("No model is loaded! Select one in the Model tab.")
return 'Error: No model is loaded1 Select one in the Model tab.', previous
is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
is_llamacpp = shared.model.__class__.__name__ == 'LlamaServer'
if is_llamacpp:
# llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer':
logprobs = shared.model.get_logits(prompt, state, n_probs=top_logits, use_samplers=use_samplers)
if return_dict:
output = {}
for entry in logprobs:
token = repr(entry['token'])
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
output[token] = prob
return output
else:
output = ''
@@ -57,9 +54,17 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
token = repr(entry['token'])
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
output += f"{prob:.5f} - {token}\n"
return output, previous
# All other model types
else:
import torch
from modules import sampler_hijack
from modules.torch_utils import get_device
is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
if not use_samplers:
state = {'stream': True}
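In the llama.cpp branch above, the server returns raw logprobs and the UI exponentiates them for display when samplers are not applied. A tiny illustration with made-up sample data:

```python
# Convert raw logprobs to probabilities for display, as in the branch above.
import numpy as np

logprobs = [{"token": "Hello", "logprob": -0.105}, {"token": "Hi", "logprob": -2.4}]
for entry in logprobs:
    print(f"{np.exp(entry['logprob']):.5f} - {entry['token']!r}")
```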


@@ -1,61 +1,11 @@
import gc
import sys
import os
import pprint
import re
import time
from pathlib import Path
import torch
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.utils import (
is_ccl_available,
is_npu_available,
is_xpu_available
)
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
BitsAndBytesConfig,
is_torch_npu_available,
is_torch_xpu_available
)
import modules.shared as shared
from modules.logging_colors import logger
from modules.models_settings import get_model_metadata
transformers.logging.set_verbosity_error()
local_rank = None
if shared.args.deepspeed:
import deepspeed
from transformers.integrations.deepspeed import (
HfDeepSpeedConfig,
is_deepspeed_zero3_enabled
)
from modules.deepspeed_parameters import generate_ds_config
# Distributed setup
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
if is_xpu_available() and is_ccl_available():
torch.xpu.set_device(local_rank)
deepspeed.init_distributed(backend="ccl")
elif is_npu_available():
torch.npu.set_device(local_rank)
deepspeed.init_distributed(dist_backend="hccl")
else:
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
last_generation_time = time.time()
@@ -66,8 +16,8 @@ def load_model(model_name, loader=None):
shared.is_seq2seq = False
shared.model_name = model_name
load_func_map = {
'Transformers': huggingface_loader,
'llama.cpp': llama_cpp_server_loader,
'Transformers': transformers_loader,
'ExLlamav3_HF': ExLlamav3_HF_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'ExLlamav2': ExLlamav2_loader,
@@ -85,8 +35,11 @@ def load_model(model_name, loader=None):
logger.error('The path to the model does not exist. Exiting.')
raise ValueError
if loader != 'llama.cpp' and 'sampler_hijack' not in sys.modules:
from modules import sampler_hijack
sampler_hijack.hijack_samplers()
shared.args.loader = loader
clear_torch_cache()
output = load_func_map[loader](model_name)
if type(output) is tuple:
model, tokenizer = output
@@ -95,6 +48,7 @@ def load_model(model_name, loader=None):
if model is None:
return None, None
else:
from modules.transformers_loader import load_tokenizer
tokenizer = load_tokenizer(model_name)
shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
@@ -110,163 +64,6 @@
return model, tokenizer
def load_tokenizer(model_name, tokenizer_dir=None):
if tokenizer_dir:
path_to_model = Path(tokenizer_dir)
else:
path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
tokenizer = None
if path_to_model.exists():
if shared.args.no_use_fast:
logger.info('Loading the tokenizer with use_fast=False.')
tokenizer = AutoTokenizer.from_pretrained(
path_to_model,
trust_remote_code=shared.args.trust_remote_code,
use_fast=not shared.args.no_use_fast
)
return tokenizer
def huggingface_loader(model_name):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
params = {
'low_cpu_mem_usage': True,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
}
if shared.args.trust_remote_code:
params['trust_remote_code'] = True
if shared.args.use_flash_attention_2:
params['use_flash_attention_2'] = True
if shared.args.force_safetensors:
params['force_safetensors'] = True
if shared.args.use_eager_attention:
params['attn_implementation'] = 'eager'
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
if 'chatglm' in model_name.lower():
LoaderClass = AutoModel
else:
if config.to_dict().get('is_encoder_decoder', False):
LoaderClass = AutoModelForSeq2SeqLM
shared.is_seq2seq = True
else:
LoaderClass = AutoModelForCausalLM
# Determine if we should use default loading
should_use_default_loading = not any([
shared.args.cpu,
shared.args.load_in_8bit,
shared.args.load_in_4bit,
shared.args.auto_devices,
shared.args.disk,
shared.args.deepspeed,
shared.args.gpu_memory is not None,
shared.args.cpu_memory is not None,
shared.args.compress_pos_emb > 1,
shared.args.alpha_value > 1,
])
# Load the model without any special settings
if should_use_default_loading:
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit):
device = get_device()
if device:
model = model.to(device)
# DeepSpeed ZeRO-3
elif shared.args.deepspeed:
model = LoaderClass.from_pretrained(
path_to_model,
torch_dtype=params['torch_dtype'],
trust_remote_code=params.get('trust_remote_code')
)
model = deepspeed.initialize(
model=model,
config_params=ds_config,
model_parameters=None,
optimizer=None,
lr_scheduler=None
)[0]
model.module.eval() # Inference
logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
# Load with quantization and/or offloading
else:
if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
shared.args.cpu = True
if shared.args.cpu:
params['torch_dtype'] = torch.float32
else:
params['device_map'] = 'auto'
if x := get_max_memory_dict():
params['max_memory'] = x
if shared.args.load_in_4bit:
# See https://github.com/huggingface/transformers/pull/23479/files
# and https://huggingface.co/blog/4bit-transformers-bitsandbytes
quantization_config_params = {
'load_in_4bit': True,
'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
'bnb_4bit_quant_type': shared.args.quant_type,
'bnb_4bit_use_double_quant': shared.args.use_double_quant,
'llm_int8_enable_fp32_cpu_offload': True
}
params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
elif shared.args.load_in_8bit:
if shared.args.auto_devices or shared.args.gpu_memory:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
else:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
if params.get('max_memory') is not None:
with init_empty_weights():
model = LoaderClass.from_config(config, trust_remote_code=params.get('trust_remote_code'))
model.tie_weights()
params['device_map'] = infer_auto_device_map(
model,
dtype=torch.int8,
max_memory=params.get('max_memory'),
no_split_module_classes=model._no_split_modules
)
if shared.args.disk:
params['offload_folder'] = shared.args.disk_cache_dir
if shared.args.compress_pos_emb > 1:
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
elif shared.args.alpha_value > 1:
params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if shared.args.torch_compile:
model = torch.compile(model)
return model
def llama_cpp_server_loader(model_name):
from modules.llama_cpp_server import LlamaServer
@@ -284,6 +81,11 @@ def llama_cpp_server_loader(model_name):
logger.error(f"Error loading the model with llama.cpp: {str(e)}")
def transformers_loader(model_name):
from modules.transformers_loader import load_model_HF
return load_model_HF(model_name)
def ExLlamav3_HF_loader(model_name):
from modules.exllamav3_hf import Exllamav3HF
@@ -328,71 +130,18 @@ def TensorRT_LLM_loader(model_name):
return model
def get_max_memory_dict():
max_memory = {}
max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB'
if shared.args.gpu_memory:
memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory))
for i in range(len(memory_map)):
max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i]
max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
# If --auto-devices is provided standalone, try to get a reasonable value
# for the maximum memory of device :0
elif shared.args.auto_devices:
if is_xpu_available():
total_mem = (torch.xpu.get_device_properties(0).total_memory / (1024 * 1024))
else:
total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024))
suggestion = round((total_mem - 1000) / 1000) * 1000
if total_mem - suggestion < 800:
suggestion -= 1000
suggestion = int(round(suggestion / 1000))
logger.warning(f"Auto-assiging --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors. You can manually set other values.")
max_memory[0] = f'{suggestion}GiB'
max_memory['cpu'] = f'{max_cpu_memory}GiB' if not re.match('.*ib$', max_cpu_memory.lower()) else max_cpu_memory
return max_memory if len(max_memory) > 0 else None
def get_device():
if torch.cuda.is_available():
return torch.device('cuda')
elif shared.args.deepspeed:
import deepspeed
return deepspeed.get_accelerator().current_device_name()
elif torch.backends.mps.is_available():
return torch.device('mps')
elif is_torch_xpu_available():
return torch.device('xpu:0')
elif is_torch_npu_available():
return torch.device('npu:0')
else:
return None
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
if torch.cuda.is_available():
torch.cuda.empty_cache()
elif is_xpu_available():
torch.xpu.empty_cache()
elif is_npu_available():
torch.npu.empty_cache()
elif torch.backends.mps.is_available():
if hasattr(torch.backends.mps, 'empty_cache'):
torch.backends.mps.empty_cache()
def unload_model(keep_model_name=False):
if shared.model is None:
return
is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
shared.model = shared.tokenizer = None
shared.lora_names = []
shared.model_dirty_from_training = False
clear_torch_cache()
if not is_llamacpp:
from modules.torch_utils import clear_torch_cache
clear_torch_cache()
if not keep_model_name:
shared.model_name = 'None'
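`load_model` now dispatches through small wrapper functions that import their backends lazily (for example, `transformers_loader` only imports `modules.transformers_loader` when that loader is selected), which is what keeps torch and transformers out of the portable llama.cpp builds. An illustrative sketch of the dispatch pattern; the function and model names below are placeholders, not project code:

```python
# Lazy loader dispatch: each entry imports its heavy backend only when used.
def _llama_cpp_loader(model_name):
    print(f"starting llama-server for {model_name} (no torch import needed)")
    return object()


def _transformers_loader(model_name):
    import torch  # deferred heavy import
    print(f"loading {model_name} with torch {torch.__version__}")
    return object()


LOAD_FUNC_MAP = {
    "llama.cpp": _llama_cpp_loader,
    "Transformers": _transformers_loader,
}


def load_model(model_name, loader="llama.cpp"):
    return LOAD_FUNC_MAP[loader](model_name)


load_model("my-model.gguf", loader="llama.cpp")
```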


@@ -188,41 +188,20 @@ def update_model_parameters(state, initial=False):
UI: update the command-line arguments based on the interface values
'''
elements = ui.list_model_elements() # the names of the parameters
gpu_memories = []
for i, element in enumerate(elements):
if element not in state:
continue
value = state[element]
if element.startswith('gpu_memory'):
gpu_memories.append(value)
continue
if initial and element in shared.provided_arguments:
continue
if element in ['cpu_memory'] and value == 0:
if element == 'cpu_memory' and value == 0:
value = vars(shared.args_defaults)[element]
# Making some simple conversions
if element == 'cpu_memory' and value is not None:
value = f"{value}MiB"
setattr(shared.args, element, value)
found_positive = False
for i in gpu_memories:
if i > 0:
found_positive = True
break
if not (initial and vars(shared.args)['gpu_memory'] != vars(shared.args_defaults)['gpu_memory']):
if found_positive:
shared.args.gpu_memory = [f"{i}MiB" for i in gpu_memories]
else:
shared.args.gpu_memory = None
def apply_model_settings_to_state(model, state):
'''


@@ -13,7 +13,10 @@ from transformers.generation.logits_process import (
from modules import shared
from modules.logging_colors import logger
from modules.models import get_device
from modules.torch_utils import get_device
original_init = transformers.GenerationConfig.__init__
original_get_logits_processor = transformers.GenerationMixin._get_logits_processor
global_scores = None
@@ -484,7 +487,7 @@ def get_logits_processor_patch(self, **kwargs):
generation_config.temperature = float(generation_config.temperature) # Must be float
# Get the original warpers
warpers = self._get_logits_processor_old(**kwargs)
warpers = original_get_logits_processor(self, **kwargs)
for i in range(len(warpers) - 1, -1, -1):
# Replace temperature with our modified class.
@@ -674,7 +677,7 @@ def get_logits_processor_patch(self, **kwargs):
def generation_config_init_patch(self, **kwargs):
self.__init___old(**kwargs)
original_init(self, **kwargs)
self.min_p = kwargs.pop("min_p", 0.0)
self.dynamic_temperature = kwargs.pop("dynamic_temperature", False)
self.dynatemp_low = kwargs.pop("dynatemp_low", 1)
@@ -702,8 +705,5 @@ def generation_config_init_patch(self, **kwargs):
def hijack_samplers():
transformers.GenerationMixin._get_logits_processor_old = transformers.GenerationMixin._get_logits_processor
transformers.GenerationMixin._get_logits_processor = get_logits_processor_patch
transformers.GenerationConfig.__init___old = transformers.GenerationConfig.__init__
transformers.GenerationConfig.__init__ = generation_config_init_patch
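The hijack now keeps the original transformers callables in module-level variables (`original_init`, `original_get_logits_processor`) and has the patches delegate to them, instead of stashing `*_old` attributes on the patched classes. A generic sketch of that save-then-patch pattern; the `Greeter` class is a stand-in for the patched transformers classes, not project code:

```python
# Save-then-patch: keep a module-level reference to the original callable and
# have the patched version delegate to it.
class Greeter:
    def greet(self, name):
        return f"Hello, {name}"


original_greet = Greeter.greet


def patched_greet(self, name):
    return original_greet(self, name).upper() + "!"


def hijack_greeter():
    Greeter.greet = patched_greet


hijack_greeter()
print(Greeter().greet("world"))  # HELLO, WORLD!
```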


@@ -79,6 +79,7 @@ group.add_argument('--model', type=str, help='Name of the model to load by defau
group.add_argument('--lora', type=str, nargs='+', help='The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.')
group.add_argument('--model-dir', type=str, default='models/', help='Path to directory with all the models.')
group.add_argument('--lora-dir', type=str, default='loras/', help='Path to directory with all the loras.')
group.add_argument('--model-menu', action='store_true', help='Show a model menu in the terminal when the web UI is first launched.')
group.add_argument('--settings', type=str, help='Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this file will be loaded by default without the need to use the --settings flag.')
group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')
@@ -91,9 +92,7 @@ group.add_argument('--loader', type=str, help='Choose the model loader manually,
# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
group.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
group.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.')
group.add_argument('--gpu-memory', type=str, nargs='+', help='Maximum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.')
group.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.')
group.add_argument('--cpu-memory', type=float, default=0, help='Maximum CPU memory in GiB. Use this for CPU offloading.')
group.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.')
group.add_argument('--disk-cache-dir', type=str, default='cache', help='Directory to save the disk cache to. Defaults to "cache".')
group.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision (using bitsandbytes).')


@@ -7,33 +7,18 @@ import time
import traceback
import numpy as np
import torch
import transformers
from transformers import (
LogitsProcessorList,
is_torch_npu_available,
is_torch_xpu_available
)
import modules.shared as shared
from modules import models, sampler_hijack
from modules import models
from modules.callbacks import (
Iteratorize,
Stream,
_StopEverythingStoppingCriteria
)
from modules.callbacks import Iteratorize
from modules.extensions import apply_extensions
from modules.grammar.grammar_utils import initialize_grammar
from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor
from modules.html_generator import generate_basic_html
from modules.logging_colors import logger
from modules.models import clear_torch_cache, get_device, load_model
sampler_hijack.hijack_samplers()
def generate_reply(*args, **kwargs):
if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']:
from modules.models import load_model
shared.model, shared.tokenizer = load_model(shared.model_name)
shared.generation_lock.acquire()
@@ -46,7 +31,6 @@ def generate_reply(*args, **kwargs):
def _generate_reply(question, state, stopping_strings=None, is_chat=False, escape_html=False, for_ui=False):
# Find the appropriate generation function
generate_func = apply_extensions('custom_generate_reply')
if generate_func is None:
@@ -80,7 +64,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
all_stop_strings += st
shared.stop_everything = False
seed = set_manual_seed(state['seed'])
last_update = -1
reply = ''
is_stream = state['stream']
@@ -93,7 +76,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
min_update_interval = 1 / state['max_updates_second']
# Generate
for reply in generate_func(question, original_question, seed, state, stopping_strings, is_chat=is_chat):
for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat):
reply, stop_found = apply_stopping_strings(reply, all_stop_strings)
if escape_html:
reply = html.escape(reply)
@ -132,44 +115,55 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
if shared.tokenizer is None: if shared.tokenizer is None:
raise ValueError('No tokenizer is loaded') raise ValueError('No tokenizer is loaded')
if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel']: # llama.cpp case
if shared.model.__class__.__name__ == 'LlamaServer': if shared.model.__class__.__name__ == 'LlamaServer':
input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token) input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token)
else: input_ids = np.array(input_ids).reshape(1, len(input_ids))
if truncation_length is not None:
input_ids = input_ids[:, -truncation_length:]
return input_ids
# All other model types
else:
import torch
from modules.torch_utils import get_device
if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel']:
input_ids = shared.tokenizer.encode(str(prompt)) input_ids = shared.tokenizer.encode(str(prompt))
if shared.model.__class__.__name__ != 'Exllamav2Model':
input_ids = np.array(input_ids).reshape(1, len(input_ids))
else:
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens)
if shared.model.__class__.__name__ not in ['Exllamav2Model']: if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None:
input_ids = np.array(input_ids).reshape(1, len(input_ids)) if add_bos_token:
else: # Add BOS token if missing
input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0:
bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]])
input_ids = torch.cat((bos_tensor, input_ids), 1)
if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None: # Prevent double BOS tokens from jinja templates
if add_bos_token: while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id:
if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0: input_ids = input_ids[:, 1:]
# Add a missing bos token (it may not have been added due to faulty model metadata) else:
bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]]) # Remove BOS tokens when not wanted
input_ids = torch.cat((bos_tensor, input_ids), 1) while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
input_ids = input_ids[:, 1:]
# Prevent double bos token due to jinja templates with <s> somewhere if truncation_length is not None:
while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id: input_ids = input_ids[:, -truncation_length:]
input_ids = input_ids[:, 1:]
else:
# Remove any bos token that may have been added
while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id:
input_ids = input_ids[:, 1:]
# Handling truncation if shared.model.__class__.__name__ in ['Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
if truncation_length is not None: return input_ids
input_ids = input_ids[:, -truncation_length:] else:
device = get_device()
if device:
return input_ids.to(device)
if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu: return input_ids
return input_ids
else:
device = get_device()
if device:
return input_ids.to(device)
return input_ids
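Note: the reworked encode() above normalizes BOS handling for the non-llama.cpp path. A minimal, self-contained sketch of that normalization on a plain list of token ids (normalize_bos is an illustrative helper, not part of the codebase; bos_token_id=1 is an invented value):

def normalize_bos(ids, bos_token_id=1, add_bos_token=True):
    # Illustrative helper mirroring the BOS logic above, on a plain Python list
    if add_bos_token:
        # Add a missing BOS token (it may be absent due to faulty model metadata)
        if len(ids) == 0 or ids[0] != bos_token_id:
            ids = [bos_token_id] + ids
        # Collapse doubled BOS tokens, e.g. from jinja templates that contain <s>
        while len(ids) > 1 and ids[0] == bos_token_id and ids[1] == bos_token_id:
            ids = ids[1:]
    else:
        # Strip BOS tokens when they are not wanted
        while len(ids) > 0 and ids[0] == bos_token_id:
            ids = ids[1:]
    return ids

assert normalize_bos([5, 6]) == [1, 5, 6]
assert normalize_bos([1, 1, 5]) == [1, 5]
assert normalize_bos([1, 5], add_bos_token=False) == [5]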
def decode(output_ids, skip_special_tokens=True): def decode(output_ids, skip_special_tokens=True):
@ -225,13 +219,17 @@ def set_manual_seed(seed):
if seed == -1: if seed == -1:
seed = random.randint(1, 2**31) seed = random.randint(1, 2**31)
torch.manual_seed(seed) if shared.args.loader != 'llama.cpp':
if torch.cuda.is_available(): import torch
torch.cuda.manual_seed_all(seed) from transformers import is_torch_npu_available, is_torch_xpu_available
elif is_torch_xpu_available():
torch.xpu.manual_seed_all(seed) torch.manual_seed(seed)
elif is_torch_npu_available(): if torch.cuda.is_available():
torch.npu.manual_seed_all(seed) torch.cuda.manual_seed_all(seed)
elif is_torch_xpu_available():
torch.xpu.manual_seed_all(seed)
elif is_torch_npu_available():
torch.npu.manual_seed_all(seed)
return seed return seed
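Note: set_manual_seed() now touches torch only when the loader is not llama.cpp, so portable builds without torch still get a usable seed. A hedged sketch of the same idea (resolve_seed is an illustrative stand-in; torch is assumed to be installed only for the non-llama.cpp branch):

import random

def resolve_seed(seed, loader='llama.cpp'):
    # -1 means "pick a fresh random seed"
    if seed == -1:
        seed = random.randint(1, 2**31)
    if loader != 'llama.cpp':
        import torch  # deferred: llama.cpp-only installs never import torch
        torch.manual_seed(seed)
    return seed

print(resolve_seed(-1))                  # fresh random seed, no torch import
print(resolve_seed(42, 'Transformers'))  # 42, and torch is seeded as well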
@ -285,10 +283,26 @@ def get_reply_from_output_ids(output_ids, state=None, starting_from=0):
return reply return reply
def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False): def generate_reply_HF(question, original_question, state, stopping_strings=None, is_chat=False):
import torch
import transformers
from transformers import LogitsProcessorList
from modules.grammar.grammar_utils import initialize_grammar
from modules.grammar.logits_process import (
GrammarConstrainedLogitsProcessor
)
from modules.torch_utils import clear_torch_cache, get_device
from modules.transformers_loader import (
Stream,
_StopEverythingStoppingCriteria
)
if shared.args.loader == 'Transformers': if shared.args.loader == 'Transformers':
clear_torch_cache() clear_torch_cache()
seed = set_manual_seed(state['seed'])
generate_params = {} generate_params = {}
for k in [ for k in [
'temperature', 'temperature',
@ -458,12 +472,12 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
return return
def generate_reply_custom(question, original_question, seed, state, stopping_strings=None, is_chat=False): def generate_reply_custom(question, original_question, state, stopping_strings=None, is_chat=False):
""" """
For models that do not use the transformers library for sampling For models that do not use the transformers library for sampling
""" """
seed = set_manual_seed(state['seed'])
seed = set_manual_seed(state['seed'])
t0 = time.time() t0 = time.time()
reply = '' reply = ''
try: try:
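Note: with the seed argument dropped from the backend signatures, each backend now derives its seed from state['seed'] itself. A toy illustration of the new call shape (set_manual_seed is stubbed and the reply text is invented):

import random

def set_manual_seed_stub(seed):
    return random.randint(1, 2**31) if seed == -1 else seed

def backend(question, original_question, state, stopping_strings=None, is_chat=False):
    seed = set_manual_seed_stub(state['seed'])  # resolved inside the backend now
    yield f"(seed={seed}) reply to: {question}"

for chunk in backend("Hello", "Hello", {'seed': 42}):
    print(chunk)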

37 modules/torch_utils.py Normal file
View file

@ -0,0 +1,37 @@
import gc
import torch
from accelerate.utils import is_npu_available, is_xpu_available
from transformers import is_torch_npu_available, is_torch_xpu_available
from modules import shared
def get_device():
if torch.cuda.is_available():
return torch.device('cuda')
elif shared.args.deepspeed:
import deepspeed
return deepspeed.get_accelerator().current_device_name()
elif torch.backends.mps.is_available():
return torch.device('mps')
elif is_torch_xpu_available():
return torch.device('xpu:0')
elif is_torch_npu_available():
return torch.device('npu:0')
else:
return None
def clear_torch_cache():
gc.collect()
if not shared.args.cpu:
if torch.cuda.is_available():
torch.cuda.empty_cache()
elif is_xpu_available():
torch.xpu.empty_cache()
elif is_npu_available():
torch.npu.empty_cache()
elif torch.backends.mps.is_available():
if hasattr(torch.backends.mps, 'empty_cache'):
torch.backends.mps.empty_cache()
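Note: a hedged usage sketch for the new helpers (assumes a full install where torch is present; in a portable llama.cpp build this module is simply never imported):

import torch

from modules.torch_utils import clear_torch_cache, get_device

device = get_device()   # cuda / mps / xpu / npu device, or None on CPU-only setups
x = torch.ones(2, 2)
if device:
    x = x.to(device)
print(x.device)

clear_torch_cache()     # gc.collect() plus a backend cache flush between generations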

View file

@ -15,13 +15,6 @@ from datetime import datetime
from pathlib import Path from pathlib import Path
import gradio as gr import gradio as gr
import torch
import transformers
from datasets import Dataset, load_dataset
from transformers import is_torch_xpu_available
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
from modules import shared, ui, utils from modules import shared, ui, utils
from modules.evaluate import ( from modules.evaluate import (
@ -33,7 +26,6 @@ from modules.logging_colors import logger
from modules.models import reload_model from modules.models import reload_model
from modules.utils import natural_keys from modules.utils import natural_keys
MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}
PARAMETERS = ["lora_name", "always_override", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"] PARAMETERS = ["lora_name", "always_override", "q_proj_en", "v_proj_en", "k_proj_en", "o_proj_en", "gate_proj_en", "down_proj_en", "up_proj_en", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after", "stop_at_loss", "add_eos_token", "min_chars", "report_to"]
WANT_INTERRUPT = False WANT_INTERRUPT = False
@ -284,6 +276,9 @@ def calc_trainable_parameters(model):
def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str): def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str):
import torch
import transformers
from datasets import Dataset, load_dataset
from peft import ( from peft import (
LoraConfig, LoraConfig,
get_peft_model, get_peft_model,
@ -293,6 +288,12 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
from peft.utils.other import \ from peft.utils.other import \
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as \ TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING as \
model_to_lora_modules model_to_lora_modules
from transformers import is_torch_xpu_available
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
MODEL_CLASSES = {v[1]: v[0] for v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.items()}
global WANT_INTERRUPT global WANT_INTERRUPT
WANT_INTERRUPT = False WANT_INTERRUPT = False
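Note: the pattern here (and in generate_reply_HF above) is to defer heavy imports such as torch, transformers, datasets, and peft into the function body, so that merely importing the module stays cheap on portable builds. Minimal sketch of the idea (numpy stands in for the heavy dependency):

def do_heavy_work(values):
    import numpy as np  # imported only when this code path actually runs
    return float(np.mean(values))

print(do_heavy_work([1, 2, 3]))  # 2.0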

View file

@ -0,0 +1,279 @@
import os
import pprint
from pathlib import Path
import torch
import torch.nn.functional as F
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.utils import (
is_ccl_available,
is_npu_available,
is_xpu_available
)
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
BitsAndBytesConfig,
LogitsProcessor
)
import modules.shared as shared
from modules.logging_colors import logger
from modules.text_generation import get_reply_from_output_ids
from modules.torch_utils import get_device
transformers.logging.set_verbosity_error()
local_rank = None
if shared.args.deepspeed:
import deepspeed
from transformers.integrations.deepspeed import (
HfDeepSpeedConfig,
is_deepspeed_zero3_enabled
)
from modules.deepspeed_parameters import generate_ds_config
# Distributed setup
local_rank = shared.args.local_rank if shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
if is_xpu_available() and is_ccl_available():
torch.xpu.set_device(local_rank)
deepspeed.init_distributed(backend="ccl")
elif is_npu_available():
torch.npu.set_device(local_rank)
deepspeed.init_distributed(dist_backend="hccl")
else:
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()
ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir)
dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration
class _StopEverythingStoppingCriteria(transformers.StoppingCriteria):
def __init__(self):
transformers.StoppingCriteria.__init__(self)
def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool:
return shared.stop_everything
class Stream(transformers.StoppingCriteria):
def __init__(self, callback_func=None):
self.callback_func = callback_func
def __call__(self, input_ids, scores) -> bool:
if self.callback_func is not None:
self.callback_func(input_ids[0])
return False
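Note: these criteria are meant to be chained into transformers' generate() call. An illustrative wiring, not part of the diff (a loaded model and prepared input_ids are assumed):

from transformers import StoppingCriteriaList

stopping_criteria = StoppingCriteriaList([
    Stream(callback_func=lambda ids: print(f"{len(ids)} tokens so far")),
    _StopEverythingStoppingCriteria(),  # aborts as soon as shared.stop_everything is set
])
# output = shared.model.generate(input_ids, stopping_criteria=stopping_criteria, ...)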
class LogitsBiasProcessor(LogitsProcessor):
def __init__(self, logit_bias={}):
self.logit_bias = logit_bias
if self.logit_bias:
self.keys = list([int(key) for key in self.logit_bias.keys()])
values = [self.logit_bias[str(key)] for key in self.keys]
self.values = torch.tensor(values, dtype=torch.float, device=shared.model.device)
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logit_bias:
logits[0, self.keys] += self.values
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logit_bias={self.logit_bias})>"
class LogprobProcessor(LogitsProcessor):
def __init__(self, logprobs=None):
self.logprobs = logprobs
self.token_alternatives = {}
def __call__(self, input_ids: torch.LongTensor, logits: torch.FloatTensor) -> torch.FloatTensor:
if self.logprobs is not None: # 0-5
log_e_probabilities = F.log_softmax(logits, dim=1)
top_values, top_indices = torch.topk(log_e_probabilities, k=self.logprobs + 1)
top_tokens = [get_reply_from_output_ids([tok]) for tok in top_indices[0]]
top_probs = [float(x) for x in top_values[0]]
self.token_alternatives = dict(zip(top_tokens, top_probs))
return logits
def __repr__(self):
return f"<{self.__class__.__name__}(logprobs={self.logprobs}, token_alternatives={self.token_alternatives})>"
def load_tokenizer(model_name, tokenizer_dir=None):
if tokenizer_dir:
path_to_model = Path(tokenizer_dir)
else:
path_to_model = Path(f"{shared.args.model_dir}/{model_name}/")
tokenizer = None
if path_to_model.exists():
if shared.args.no_use_fast:
logger.info('Loading the tokenizer with use_fast=False.')
tokenizer = AutoTokenizer.from_pretrained(
path_to_model,
trust_remote_code=shared.args.trust_remote_code,
use_fast=not shared.args.no_use_fast
)
return tokenizer
def load_model_HF(model_name):
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
params = {
'low_cpu_mem_usage': True,
'torch_dtype': torch.bfloat16 if shared.args.bf16 else torch.float16,
}
if shared.args.trust_remote_code:
params['trust_remote_code'] = True
if shared.args.use_flash_attention_2:
params['use_flash_attention_2'] = True
if shared.args.force_safetensors:
params['force_safetensors'] = True
if shared.args.use_eager_attention:
params['attn_implementation'] = 'eager'
config = AutoConfig.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
if 'chatglm' in model_name.lower():
LoaderClass = AutoModel
else:
if config.to_dict().get('is_encoder_decoder', False):
LoaderClass = AutoModelForSeq2SeqLM
shared.is_seq2seq = True
else:
LoaderClass = AutoModelForCausalLM
# Determine if we should use default loading
should_use_default_loading = not any([
shared.args.cpu,
shared.args.load_in_8bit,
shared.args.load_in_4bit,
shared.args.disk,
shared.args.deepspeed,
shared.args.cpu_memory is not None,
shared.args.compress_pos_emb > 1,
shared.args.alpha_value > 1,
])
# Load the model without any special settings
if should_use_default_loading:
params['device_map'] = 'auto'
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if not (hasattr(model, 'is_loaded_in_4bit') and model.is_loaded_in_4bit):
device = get_device()
if device:
model = model.to(device)
# DeepSpeed ZeRO-3
elif shared.args.deepspeed:
model = LoaderClass.from_pretrained(
path_to_model,
torch_dtype=params['torch_dtype'],
trust_remote_code=params.get('trust_remote_code')
)
model = deepspeed.initialize(
model=model,
config_params=ds_config,
model_parameters=None,
optimizer=None,
lr_scheduler=None
)[0]
model.module.eval() # Inference
logger.info(f'DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}')
# Load with quantization and/or offloading
else:
if not any((shared.args.cpu, torch.cuda.is_available(), is_xpu_available(), torch.backends.mps.is_available())):
logger.warning('torch.cuda.is_available() and is_xpu_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.')
shared.args.cpu = True
if shared.args.cpu:
params['torch_dtype'] = torch.float32
else:
params['device_map'] = 'auto'
if x := get_max_memory_dict():
params['max_memory'] = x
if shared.args.load_in_4bit:
# See https://github.com/huggingface/transformers/pull/23479/files
# and https://huggingface.co/blog/4bit-transformers-bitsandbytes
quantization_config_params = {
'load_in_4bit': True,
'bnb_4bit_compute_dtype': eval(f"torch.{shared.args.compute_dtype}") if shared.args.compute_dtype in ["bfloat16", "float16", "float32"] else None,
'bnb_4bit_quant_type': shared.args.quant_type,
'bnb_4bit_use_double_quant': shared.args.use_double_quant,
'llm_int8_enable_fp32_cpu_offload': True
}
params['quantization_config'] = BitsAndBytesConfig(**quantization_config_params)
elif shared.args.load_in_8bit:
if shared.args.gpu_split:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True)
else:
params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
if params.get('max_memory') is not None:
with init_empty_weights():
model = LoaderClass.from_config(config, trust_remote_code=params.get('trust_remote_code'))
model.tie_weights()
params['device_map'] = infer_auto_device_map(
model,
dtype=torch.int8,
max_memory=params.get('max_memory'),
no_split_module_classes=model._no_split_modules
)
if shared.args.disk:
params['offload_folder'] = shared.args.disk_cache_dir
if shared.args.compress_pos_emb > 1:
params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb}
elif shared.args.alpha_value > 1:
params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value}
logger.info("TRANSFORMERS_PARAMS=")
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params)
print()
model = LoaderClass.from_pretrained(path_to_model, **params)
if shared.args.torch_compile:
model = torch.compile(model)
return model
def get_max_memory_dict():
max_memory = {}
if shared.args.cpu_memory > 0:
max_memory['cpu'] = f'{shared.args.cpu_memory}GiB'
if shared.args.gpu_split:
for i, memory in enumerate(shared.args.gpu_split.split(',')):
max_memory[i] = f'{memory}GiB'
return max_memory if len(max_memory) > 0 else None
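Note: a worked example of how the launch flags map onto the max_memory dict that accelerate consumes. The helper mirrors the logic above and is not part of the codebase; the flag values are invented:

# --gpu-split 20,7 --cpu-memory 64  ->  {'cpu': '64GiB', 0: '20GiB', 1: '7GiB'}
def max_memory_from_flags(gpu_split=None, cpu_memory=0):
    max_memory = {}
    if cpu_memory and cpu_memory > 0:
        max_memory['cpu'] = f'{cpu_memory}GiB'
    if gpu_split:
        for i, memory in enumerate(gpu_split.split(',')):
            max_memory[i] = f'{memory}GiB'
    return max_memory if max_memory else None

print(max_memory_from_flags('20,7', 64))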

View file

@ -2,9 +2,7 @@ import copy
from pathlib import Path from pathlib import Path
import gradio as gr import gradio as gr
import torch
import yaml import yaml
from transformers import is_torch_xpu_available
import extensions import extensions
from modules import shared from modules import shared
@ -128,7 +126,6 @@ def list_model_elements():
'torch_compile', 'torch_compile',
'flash_attn', 'flash_attn',
'use_flash_attention_2', 'use_flash_attention_2',
'auto_devices',
'cpu', 'cpu',
'disk', 'disk',
'row_split', 'row_split',
@ -150,13 +147,6 @@ def list_model_elements():
'no_use_fast', 'no_use_fast',
] ]
if is_torch_xpu_available():
for i in range(torch.xpu.device_count()):
elements.append(f'gpu_memory_{i}')
else:
for i in range(torch.cuda.device_count()):
elements.append(f'gpu_memory_{i}')
return elements return elements

View file

@ -1,14 +1,9 @@
import importlib import importlib
import math
import re
import traceback import traceback
from functools import partial from functools import partial
from pathlib import Path from pathlib import Path
import gradio as gr import gradio as gr
import psutil
import torch
from transformers import is_torch_npu_available, is_torch_xpu_available
from modules import loaders, shared, ui, utils from modules import loaders, shared, ui, utils
from modules.logging_colors import logger from modules.logging_colors import logger
@ -27,35 +22,6 @@ from modules.utils import gradio
def create_ui(): def create_ui():
mu = shared.args.multi_user mu = shared.args.multi_user
# Finding the default values for the GPU and CPU memories
total_mem = []
if is_torch_xpu_available():
for i in range(torch.xpu.device_count()):
total_mem.append(math.floor(torch.xpu.get_device_properties(i).total_memory / (1024 * 1024)))
elif is_torch_npu_available():
for i in range(torch.npu.device_count()):
total_mem.append(math.floor(torch.npu.get_device_properties(i).total_memory / (1024 * 1024)))
else:
for i in range(torch.cuda.device_count()):
total_mem.append(math.floor(torch.cuda.get_device_properties(i).total_memory / (1024 * 1024)))
default_gpu_mem = []
if shared.args.gpu_memory is not None and len(shared.args.gpu_memory) > 0:
for i in shared.args.gpu_memory:
if 'mib' in i.lower():
default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)))
else:
default_gpu_mem.append(int(re.sub('[a-zA-Z ]', '', i)) * 1000)
while len(default_gpu_mem) < len(total_mem):
default_gpu_mem.append(0)
total_cpu_mem = math.floor(psutil.virtual_memory().total / (1024 * 1024))
if shared.args.cpu_memory is not None:
default_cpu_mem = re.sub('[a-zA-Z ]', '', shared.args.cpu_memory)
else:
default_cpu_mem = 0
with gr.Tab("Model", elem_id="model-tab"): with gr.Tab("Model", elem_id="model-tab"):
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
@ -80,10 +46,6 @@ def create_ui():
with gr.Blocks(): with gr.Blocks():
with gr.Row(): with gr.Row():
with gr.Column(): with gr.Column():
for i in range(len(total_mem)):
shared.gradio[f'gpu_memory_{i}'] = gr.Slider(label=f"gpu-memory in MiB for device :{i}", maximum=total_mem[i], value=default_gpu_mem[i])
shared.gradio['cpu_memory'] = gr.Slider(label="cpu-memory in MiB", maximum=total_cpu_mem, value=default_cpu_mem)
shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
@ -94,6 +56,7 @@ def create_ui():
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')
shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.') shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
@ -107,7 +70,6 @@ def create_ui():
shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')

View file

@ -15,7 +15,6 @@ import sys
# os.environ["HSA_OVERRIDE_GFX_VERSION"] = '10.3.0' # os.environ["HSA_OVERRIDE_GFX_VERSION"] = '10.3.0'
# os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030' # os.environ["HCC_AMDGPU_TARGET"] = 'gfx1030'
# Define the required versions # Define the required versions
TORCH_VERSION = "2.6.0" TORCH_VERSION = "2.6.0"
TORCHVISION_VERSION = "0.21.0" TORCHVISION_VERSION = "0.21.0"
@ -62,6 +61,19 @@ def is_x86_64():
return platform.machine() == "x86_64" return platform.machine() == "x86_64"
def is_installed():
site_packages_path = None
for sitedir in site.getsitepackages():
if "site-packages" in sitedir and conda_env_path in sitedir:
site_packages_path = sitedir
break
if site_packages_path:
return os.path.isfile(os.path.join(site_packages_path, 'torch', '__init__.py'))
else:
return os.path.isdir(conda_env_path)
def cpu_has_avx2(): def cpu_has_avx2():
try: try:
import cpuinfo import cpuinfo
@ -104,44 +116,13 @@ def torch_version():
return torver return torver
def update_pytorch_and_python(): def get_current_commit():
print_big_message("Checking for PyTorch updates.") result = run_cmd("git rev-parse HEAD", capture_output=True, environment=True)
return result.stdout.decode('utf-8').strip()
# Update the Python version. Left here for future reference in case this becomes necessary.
# print_big_message("Checking for PyTorch and Python updates.")
# current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
# if current_python_version != PYTHON_VERSION:
# run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True)
torver = torch_version()
base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}"
if "+cu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
elif "+rocm" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
elif "+cpu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
elif "+cxx11" in torver:
intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
else:
install_cmd = base_cmd
run_cmd(install_cmd, assert_success=True, environment=True)
def is_installed(): def get_extensions_names():
site_packages_path = None return [foldername for foldername in os.listdir('extensions') if os.path.isfile(os.path.join('extensions', foldername, 'requirements.txt'))]
for sitedir in site.getsitepackages():
if "site-packages" in sitedir and conda_env_path in sitedir:
site_packages_path = sitedir
break
if site_packages_path:
return os.path.isfile(os.path.join(site_packages_path, 'torch', '__init__.py'))
else:
return os.path.isdir(conda_env_path)
def check_env(): def check_env():
@ -157,35 +138,11 @@ def check_env():
sys.exit(1) sys.exit(1)
def get_current_commit():
result = run_cmd("git rev-parse HEAD", capture_output=True, environment=True)
return result.stdout.decode('utf-8').strip()
def clear_cache(): def clear_cache():
run_cmd("conda clean -a -y", environment=True) run_cmd("conda clean -a -y", environment=True)
run_cmd("python -m pip cache purge", environment=True) run_cmd("python -m pip cache purge", environment=True)
def print_big_message(message):
message = message.strip()
lines = message.split('\n')
print("\n\n*******************************************************************")
for line in lines:
print("*", line)
print("*******************************************************************\n\n")
def calculate_file_hash(file_path):
p = os.path.join(script_dir, file_path)
if os.path.isfile(p):
with open(p, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
else:
return ''
def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, env=None): def run_cmd(cmd, assert_success=False, environment=False, capture_output=False, env=None):
# Use the conda environment # Use the conda environment
if environment: if environment:
@ -210,6 +167,25 @@ def run_cmd(cmd, assert_success=False, environment=False, capture_output=False,
return result return result
def print_big_message(message):
message = message.strip()
lines = message.split('\n')
print("\n\n*******************************************************************")
for line in lines:
print("*", line)
print("*******************************************************************\n\n")
def calculate_file_hash(file_path):
p = os.path.join(script_dir, file_path)
if os.path.isfile(p):
with open(p, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
else:
return ''
def generate_alphabetic_sequence(index): def generate_alphabetic_sequence(index):
result = '' result = ''
while index >= 0: while index >= 0:
@ -238,6 +214,51 @@ def get_user_choice(question, options_dict):
return choice return choice
def update_pytorch_and_python():
print_big_message("Checking for PyTorch updates.")
# Update the Python version. Left here for future reference in case this becomes necessary.
# print_big_message("Checking for PyTorch and Python updates.")
# current_python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
# if current_python_version != PYTHON_VERSION:
# run_cmd(f"conda install -y python={PYTHON_VERSION}", assert_success=True, environment=True)
torver = torch_version()
base_cmd = f"python -m pip install --upgrade torch=={TORCH_VERSION} torchvision=={TORCHVISION_VERSION} torchaudio=={TORCHAUDIO_VERSION}"
if "+cu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124"
elif "+rocm" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1"
elif "+cpu" in torver:
install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu"
elif "+cxx11" in torver:
intel_extension = "intel-extension-for-pytorch==2.1.10+xpu" if is_linux() else "intel-extension-for-pytorch==2.1.10"
install_cmd = f"{base_cmd} {intel_extension} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"
else:
install_cmd = base_cmd
run_cmd(install_cmd, assert_success=True, environment=True)
def clean_outdated_pytorch_cuda_dependencies():
patterns = ["cu121", "cu122", "torch2.4"]
result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True)
matching_packages = []
for line in result.stdout.decode('utf-8').splitlines():
if "==" in line:
pkg_name, version = line.split('==', 1)
if any(pattern in version for pattern in patterns):
matching_packages.append(pkg_name)
if matching_packages:
print(f"\nUninstalling: {', '.join(matching_packages)}\n")
run_cmd(f"python -m pip uninstall -y {' '.join(matching_packages)}", assert_success=True, environment=True)
return matching_packages
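Note: the version-string matching above operates on the output of pip list --format=freeze. A self-contained example with invented package lines:

patterns = ["cu121", "cu122", "torch2.4"]
freeze_output = """torch==2.4.1+cu121
numpy==1.26.4
exllamav2==0.2.7+cu121.torch2.4.1"""

matching_packages = []
for line in freeze_output.splitlines():
    if "==" in line:
        pkg_name, version = line.split('==', 1)
        if any(pattern in version for pattern in patterns):
            matching_packages.append(pkg_name)

print(matching_packages)  # ['torch', 'exllamav2']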
def install_webui(): def install_webui():
if os.path.isfile(state_file): if os.path.isfile(state_file):
os.remove(state_file) os.remove(state_file)
@ -323,37 +344,6 @@ def install_webui():
update_requirements(initial_installation=True, pull=False) update_requirements(initial_installation=True, pull=False)
def get_extensions_names():
return [foldername for foldername in os.listdir('extensions') if os.path.isfile(os.path.join('extensions', foldername, 'requirements.txt'))]
def install_extensions_requirements():
print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.")
extensions = get_extensions_names()
for i, extension in enumerate(extensions):
print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n")
extension_req_path = os.path.join("extensions", extension, "requirements.txt")
run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True)
def clean_outdated_pytorch_cuda_dependencies():
patterns = ["cu121", "cu122", "torch2.4"]
result = run_cmd("python -m pip list --format=freeze", capture_output=True, environment=True)
matching_packages = []
for line in result.stdout.decode('utf-8').splitlines():
if "==" in line:
pkg_name, version = line.split('==', 1)
if any(pattern in version for pattern in patterns):
matching_packages.append(pkg_name)
if matching_packages:
print(f"\nUninstalling: {', '.join(matching_packages)}\n")
run_cmd(f"python -m pip uninstall -y {' '.join(matching_packages)}", assert_success=True, environment=True)
return matching_packages
def update_requirements(initial_installation=False, pull=True): def update_requirements(initial_installation=False, pull=True):
# Create .git directory if missing # Create .git directory if missing
if not os.path.exists(os.path.join(script_dir, ".git")): if not os.path.exists(os.path.join(script_dir, ".git")):
@ -366,14 +356,18 @@ def update_requirements(initial_installation=False, pull=True):
) )
torver = torch_version() torver = torch_version()
requirements_base = os.path.join("requirements", "full")
if "+rocm" in torver: if "+rocm" in torver:
requirements_file = "requirements_amd" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt" file_name = f"requirements_amd{'_noavx2' if not cpu_has_avx2() else ''}.txt"
elif "+cpu" in torver or "+cxx11" in torver: elif "+cpu" in torver or "+cxx11" in torver:
requirements_file = "requirements_cpu_only" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt" file_name = f"requirements_cpu_only{'_noavx2' if not cpu_has_avx2() else ''}.txt"
elif is_macos(): elif is_macos():
requirements_file = "requirements_apple_" + ("intel" if is_x86_64() else "silicon") + ".txt" file_name = f"requirements_apple_{'intel' if is_x86_64() else 'silicon'}.txt"
else: else:
requirements_file = "requirements" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt" file_name = f"requirements{'_noavx2' if not cpu_has_avx2() else ''}.txt"
requirements_file = os.path.join(requirements_base, file_name)
# Load state from JSON file # Load state from JSON file
current_commit = get_current_commit() current_commit = get_current_commit()
@ -475,6 +469,15 @@ def update_requirements(initial_installation=False, pull=True):
clear_cache() clear_cache()
def install_extensions_requirements():
print_big_message("Installing extensions requirements.\nSome of these may fail on Windows.\nDon\'t worry if you see error messages, as they will not affect the main program.")
extensions = get_extensions_names()
for i, extension in enumerate(extensions):
print(f"\n\n--- [{i + 1}/{len(extensions)}]: {extension}\n\n")
extension_req_path = os.path.join("extensions", extension, "requirements.txt")
run_cmd(f"python -m pip install -r {extension_req_path} --upgrade", assert_success=False, environment=True)
def launch_webui(): def launch_webui():
run_cmd(f"python server.py {flags}", environment=True) run_cmd(f"python server.py {flags}", environment=True)

View file

@ -7,7 +7,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -26,14 +25,13 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# CUDA wheels # CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,12 +24,11 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# AMD wheels # AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,12 +24,11 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# AMD wheels # AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,14 +24,12 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# Mac wheels # Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,13 +24,13 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# Mac wheels # Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,11 +24,10 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# llama.cpp (CPU only, AVX2) # llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,11 +24,10 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# llama.cpp (CPU only, no AVX2) # llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -7,7 +7,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -26,14 +25,13 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken
# CUDA wheels # CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -6,7 +6,6 @@ fastapi==0.112.4
gradio==4.37.* gradio==4.37.*
jinja2==3.1.6 jinja2==3.1.6
markdown markdown
numba==0.59.*
numpy==1.26.* numpy==1.26.*
pandas pandas
peft==0.15.* peft==0.15.*
@ -25,7 +24,6 @@ tqdm
wandb wandb
# API # API
SpeechRecognition==3.10.0
flask_cloudflared==0.0.14 flask_cloudflared==0.0.14
sse-starlette==1.6.5 sse-starlette==1.6.5
tiktoken tiktoken

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,18 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,18 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"

View file

@ -0,0 +1,20 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@ -0,0 +1,19 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.3.0/llama_cpp_binaries-0.3.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -0,0 +1,15 @@
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.6
markdown
numpy==1.26.*
pydantic==2.8.2
pyyaml
requests
rich
tqdm
# API
flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

View file

@ -1,11 +1,8 @@
 import os
 import warnings

-from modules import shared
-import accelerate  # This early import makes Intel GPUs happy
 import modules.one_click_installer_check
+from modules import shared
 from modules.block_requests import OpenMonkeyPatch, RequestBlocker
 from modules.logging_colors import logger
@ -38,7 +35,6 @@ import yaml
 import modules.extensions as extensions_module
 from modules import (
-    chat,
     training,
     ui,
     ui_chat,
@ -89,7 +85,7 @@ def create_interface():
     # Force some events to be triggered on page load
     shared.persistent_interface_state.update({
-        'loader': shared.args.loader or 'Transformers',
+        'loader': shared.args.loader or 'llama.cpp',
         'mode': shared.settings['mode'] if shared.settings['mode'] == 'instruct' else gr.update(),
         'character_menu': shared.args.character or shared.settings['character'],
         'instruction_template_str': shared.settings['instruction_template_str'],
@ -218,10 +214,28 @@ if __name__ == "__main__":
         if extension not in shared.args.extensions:
             shared.args.extensions.append(extension)

+    available_models = utils.get_available_models()
+
     # Model defined through --model
     if shared.args.model is not None:
         shared.model_name = shared.args.model

+    # Select the model from a command-line menu
+    elif shared.args.model_menu:
+        if len(available_models) == 0:
+            logger.error('No models are available! Please download at least one.')
+            sys.exit(0)
+        else:
+            print('The following models are available:\n')
+            for i, model in enumerate(available_models):
+                print(f'{i+1}. {model}')
+
+            print(f'\nWhich one do you want to load? 1-{len(available_models)}\n')
+            i = int(input()) - 1
+            print()
+
+        shared.model_name = available_models[i]
+
     # If any model has been selected, load it
     if shared.model_name != 'None':
         p = Path(shared.model_name)

View file

@ -2,6 +2,12 @@
cd "$(dirname "${BASH_SOURCE[0]}")" cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch "$@"
exit $?
fi
if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
# deactivate existing conda envs as needed to avoid conflicts # deactivate existing conda envs as needed to avoid conflicts

View file

@ -2,6 +2,12 @@
cd "$(dirname "${BASH_SOURCE[0]}")" cd "$(dirname "${BASH_SOURCE[0]}")"
# Portable install case
if [ -d "portable_env" ]; then
./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@"
exit $?
fi
if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
# deactivate existing conda envs as needed to avoid conflicts # deactivate existing conda envs as needed to avoid conflicts

View file

@ -3,6 +3,12 @@ setlocal enabledelayedexpansion
cd /D "%~dp0" cd /D "%~dp0"
@rem Portable install case
if exist "portable_env" (
.\portable_env\python.exe server.py --api --auto-launch %*
exit /b %errorlevel%
)
set PATH=%PATH%;%SystemRoot%\system32 set PATH=%PATH%;%SystemRoot%\system32
echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end echo "%CD%"| findstr /C:" " >nul && echo This script relies on Miniconda which can not be silently installed under a path with spaces. && goto end