Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2025-06-07 06:06:20 -04:00)
Commit c5e54c0b37
42 changed files with 561 additions and 2113 deletions

@@ -67,7 +67,6 @@
"\n",
" # Install the project in an isolated environment\n",
" !GPU_CHOICE=A \\\n",
" USE_CUDA118=FALSE \\\n",
" LAUNCH_AFTER_INSTALL=FALSE \\\n",
" INSTALL_EXTENSIONS=FALSE \\\n",
" ./start_linux.sh\n",

README.md (42 changes)
@@ -126,17 +126,6 @@ Then browse to

`http://localhost:7860/?__theme=dark`

##### AMD GPU on Windows

1) Use `requirements_cpu_only.txt` or `requirements_cpu_only_noavx2.txt` in the command above.

2) Manually install llama-cpp-python using the appropriate command for your hardware: [Installation from PyPI](https://github.com/abetlen/llama-cpp-python#installation-with-hardware-acceleration).
  * Use the `LLAMA_HIPBLAS=on` toggle.
  * Note the [Windows remarks](https://github.com/abetlen/llama-cpp-python#windows-remarks).

3) Manually install AutoGPTQ: [Installation](https://github.com/PanQiWei/AutoGPTQ#install-from-source).
  * Perform the from-source installation - there are no prebuilt ROCm packages for Windows.

##### Manual install

The `requirements*.txt` above contain various wheels precompiled through GitHub Actions. If you wish to compile things manually, or if you need to because no suitable wheels are available for your hardware, you can use `requirements_nowheels.txt` and then install your desired loaders manually.

@@ -184,19 +173,17 @@ List of command-line flags
</summary>

```txt
usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--model-menu] [--settings SETTINGS]
usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [--lora LORA [LORA ...]] [--model-dir MODEL_DIR] [--lora-dir LORA_DIR] [--settings SETTINGS]
[--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--auto-devices] [--gpu-memory GPU_MEMORY [GPU_MEMORY ...]]
[--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast]
[--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn]
[--tensorcores] [--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--no_mul_mat_q] [--n_batch N_BATCH] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS]
[--tensor_split TENSOR_SPLIT] [--numa] [--logits_all] [--no_offload_kqv] [--cache-capacity CACHE_CAPACITY] [--row_split] [--streaming-llm] [--attention-sink-size ATTENTION_SINK_SIZE]
[--tokenizer-dir TOKENIZER_DIR] [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa]
[--n_ctx N_CTX] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT]
[--numa] [--no-kv-offload] [--row-split] [--gpu-split GPU_SPLIT] [--autosplit] [--max_seq_len MAX_SEQ_LEN] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa]
[--num_experts_per_token NUM_EXPERTS_PER_TOKEN] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR]
[--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT]
[--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE]
[--subpath SUBPATH] [--old-colors] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6]
[--api-disable-ipv4] [--nowebui] [--multimodal-pipeline MULTIMODAL_PIPELINE] [--cache_4bit] [--cache_8bit] [--chat-buttons] [--triton] [--no_inject_fused_mlp] [--no_use_cuda_fp16]
[--desc_act] [--disable_exllama] [--disable_exllamav2] [--wbits WBITS] [--groupsize GROUPSIZE]
[--api-disable-ipv4] [--nowebui]

Text generation web UI

@@ -210,7 +197,6 @@ Basic settings:
--lora LORA [LORA ...] The list of LoRAs to load. If you want to load more than one LoRA, write the names separated by spaces.
--model-dir MODEL_DIR Path to directory with all the models.
--lora-dir LORA_DIR Path to directory with all the loras.
--model-menu Show a model menu in the terminal when the web UI is first launched.
--settings SETTINGS Load the default interface settings from this yaml file. See settings-template.yaml for an example. If you create a file called settings.yaml, this
file will be loaded by default without the need to use the --settings flag.
--extensions EXTENSIONS [EXTENSIONS ...] The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.

@@ -218,7 +204,7 @@ Basic settings:
--idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.

Model loader:
--loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2,
--loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2,
HQQ, TensorRT-LLM.

Transformers/Accelerate:

@@ -247,24 +233,17 @@ bitsandbytes 4-bit:

llama.cpp:
--flash-attn Use flash-attention.
--tensorcores NVIDIA only: use llama-cpp-python compiled without GGML_CUDA_FORCE_MMQ. This may improve performance on newer cards.
--n_ctx N_CTX Size of the prompt context.
--threads THREADS Number of threads to use.
--threads-batch THREADS_BATCH Number of threads to use for batches/prompt processing.
--no_mul_mat_q Disable the mulmat kernels.
--n_batch N_BATCH Maximum number of prompt tokens to batch together when calling llama_eval.
--batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval.
--no-mmap Prevent mmap from being used.
--mlock Force the system to keep the model in RAM.
--n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU.
--tensor_split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.
--tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.
--numa Activate NUMA task allocation for llama.cpp.
--logits_all Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.
--no_offload_kqv Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
--cache-capacity CACHE_CAPACITY Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.
--row_split Split the model by rows across GPUs. This may improve multi-gpu performance.
--streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.
--attention-sink-size ATTENTION_SINK_SIZE StreamingLLM: number of sink tokens. Only used if the trimmed prompt does not share a prefix with the old prompt.
--tokenizer-dir TOKENIZER_DIR Load the tokenizer from this folder. Meant to be used with llamacpp_HF through the command-line.
--no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.
--row-split Split the model by rows across GPUs. This may improve multi-gpu performance.

ExLlamaV2:
--gpu-split GPU_SPLIT Comma-separated list of VRAM (in GB) to use per GPU device for model layers. Example: 20,7,7.

@@ -319,9 +298,6 @@ API:
--api-enable-ipv6 Enable IPv6 for the API
--api-disable-ipv4 Disable IPv4 for the API
--nowebui Do not launch the Gradio UI. Useful for launching the API in standalone mode.

Multimodal:
--multimodal-pipeline MULTIMODAL_PIPELINE The multimodal pipeline to use. Examples: llava-7b, llava-13b.
```

</details>

@@ -1,85 +0,0 @@
# Technical description of multimodal extension

## Working principle

The multimodal extension handles most of what is required for any image input:

- adds the UI
- saves the images as base64 JPEGs to history
- provides the hooks to the UI
- if there are images in the prompt, it:
  - splits the prompt into text and image parts
  - adds image start/end markers to the text parts, then encodes and embeds the text parts
  - calls the vision pipeline to embed the images
  - stitches the embeddings together and returns them to text generation
- loads the appropriate vision pipeline, selected either from the model name or via the --multimodal-pipeline parameter

The pipelines, in turn:

- load the required vision models
- return some constants, for example the number of tokens taken up by an image
- and, most importantly, return the embeddings for the LLM, given a list of images

## Prompts/history

To save images in the prompt/history, this extension uses a base64 JPEG wrapped in an HTML tag, like so:
```
<img src="data:image/jpeg;base64,{img_str}">
```
where `{img_str}` is the actual image data. This format makes displaying the images in the UI free. Note that the format must match exactly; the regex used to find the images is `<img src="data:image/jpeg;base64,([A-Za-z0-9+/=]+)">`.
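
For illustration, here is a minimal sketch (not part of the extension) of how images can be recovered from a saved prompt/history string using that exact regex:

```python
import base64
import re
from io import BytesIO

from PIL import Image

# The same pattern the extension uses to locate embedded images.
IMG_RE = re.compile(r'<img src="data:image/jpeg;base64,([A-Za-z0-9+/=]+)">')


def extract_images(prompt: str) -> list:
    """Decode every base64 JPEG embedded in the prompt into a PIL image."""
    return [Image.open(BytesIO(base64.b64decode(data))) for data in IMG_RE.findall(prompt)]
```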

## LLM input

To describe the input, consider this example prompt:
```
text1<image1>text2<image2>text3
```
where `textN` is the N-th text part and `<imageN>` is the N-th image, in the HTML format specified above.

**The first step is to split the prompt into image/text parts**, so we get:
```
['text1', '<image1>', 'text2', '<image2>', 'text3']
```
This is done in the `MultimodalEmbedder._split_prompt(...)` function, which returns a list of `PromptPart`s - dataclasses wrapping the separate parts.

This function also appends the image start/end markers to the text, which are provided by the `AbstractMultimodalPipeline.image_start()` / `AbstractMultimodalPipeline.image_end()` functions. If the image start marker is `<Img>` and the end marker is `</Img>`, the function returns:
```
['text1<Img>', '<image1>', '</Img>text2<Img>', '<image2>', '</Img>text3']
```

**The returned prompt parts are then turned into token embeddings.**

First, they are converted to token IDs. For the text parts this is done with the standard `modules.text_generation.encode()` function; for the images, the token IDs are replaced with placeholders. A placeholder is a list of `N` copies of the placeholder token ID, where `N` is given by `AbstractMultimodalPipeline.num_image_embeds()` and the placeholder token ID by `AbstractMultimodalPipeline.placeholder_token_id()`.
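
As a concrete sketch (values taken from the LLaVA pipeline further down in this commit, where one image uses 256 embeds and the 13B placeholder token ID is 32000):

```python
import torch

# Build the placeholder input_ids for a single image, as described above.
num_image_embeds = 256        # AbstractMultimodalPipeline.num_image_embeds()
placeholder_token_id = 32000  # AbstractMultimodalPipeline.placeholder_token_id()
placeholder_ids = torch.full((num_image_embeds,), placeholder_token_id, dtype=torch.int64)
print(placeholder_ids.shape)  # torch.Size([256])
```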

Now, based on the token IDs, the prompt might get truncated, especially if `max_new_tokens` is set unreasonably high. Unfortunately, this can't be done by simply trimming the prompt until it is short enough, because the cut can land in the middle of an image embedding, which usually breaks generation. In that case the entire image needs to be removed from the input instead. This is done inside the `MultimodalEmbedder._encode_text(...)` function.

**After tokenization, the tokens need to be embedded**; the text and images are once again treated separately.

The text parts are turned into embeddings using the `AbstractMultimodalPipeline.embed_tokens(...)` function. It uses the model's standard embedding function, but to support many LLMs the actual function is returned by the pipeline (as it may differ between LLMs); for LLaMA it is `shared.model.model.embed_tokens(...)`.

The image parts are turned into embeddings using the `AbstractMultimodalPipeline.embed_images(...)` function. This function is specific to a given pipeline: it takes the images as input, forwards them through the vision model/projector, and returns the embeddings.

**Finally, the returned embeddings are stitched together** using `torch.cat()`, creating the final input to the LLM.
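
A minimal sketch of that final stitching step (the hidden size and part shapes here are purely illustrative):

```python
import torch

hidden_size = 5120                            # e.g. a 13B LLaMA model
text_embeds = torch.randn(12, hidden_size)    # embeddings of a text part (12 tokens)
image_embeds = torch.randn(256, hidden_size)  # embeddings of one image (256 "tokens")

# Concatenate the parts in prompt order to form the LLM input.
input_embeds = torch.cat((text_embeds, image_embeds), dim=0)
print(input_embeds.shape)  # torch.Size([268, 5120])
```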

## Pipelines

All pipelines should subclass the `AbstractMultimodalPipeline` class. The idea is to allow new pipelines to be added in the same way as user extensions: git clone into `extensions/multimodal/pipelines`.

The pipelines describe the vision part, containing the vision model/multimodal projector. Each pipeline should have a unique `name()`, which the user then selects with the `--multimodal-pipeline` CLI argument. For an example, see `pipelines/llava/llava.py`.

## Pipeline modules

Pipelines are organized into "pipeline modules" - subdirectories of the `pipelines` directory. A pipeline module should contain a file called `pipelines.py` with the following fields (see the sketch below):

- `available_pipelines: List[str]` - the list of pipelines provided by this module, shown to the user as the available pipelines
- `def get_pipeline(name: str, params: dict) -> Optional[AbstractMultimodalPipeline]` - a function returning a concrete pipeline by `name`; if `name` doesn't match any, it should return `None`. `params` holds the user settings for the multimodal extension
- `def get_pipeline_from_model_name(model_name: str, params: dict) -> Optional[AbstractMultimodalPipeline]` - a function returning a pipeline from `model_name`; it should be eager to return `None` unless the determination can be made clearly (for example, MiniGPT-4 is based on Vicuna, so it should never return a pipeline from the model name alone, but LLaVA can, as it has its own specific LLM finetune)

**NOTE**: A pipeline module should lazy-import its pipelines only when necessary, and it should keep its imports to a minimum.
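
For illustration, a minimal hypothetical `pipelines.py` could look like this; `my-pipeline-7b`, `my_pipeline` and `MyPipeline` are made-up names, and the real built-in example is `pipelines/llava/pipelines.py`, shown later in this commit:

```python
from typing import List, Optional

from extensions.multimodal.abstract_pipeline import AbstractMultimodalPipeline

available_pipelines: List[str] = ['my-pipeline-7b']


def get_pipeline(name: str, params: dict) -> Optional[AbstractMultimodalPipeline]:
    if name == 'my-pipeline-7b':
        from .my_pipeline import MyPipeline  # lazy import, as recommended above
        return MyPipeline(params)
    return None


def get_pipeline_from_model_name(model_name: str, params: dict) -> Optional[AbstractMultimodalPipeline]:
    # Only claim the model if the name clearly identifies it.
    if 'my-model' in model_name.lower():
        from .my_pipeline import MyPipeline
        return MyPipeline(params)
    return None
```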

## Pipeline params

The pipelines receive the extension `params` in their constructor. They should honor the following fields:

- `vision_device` - string specifying the `torch.device` to run the vision model (CLIP/ViT) on
- `vision_bits` - int, number of floating-point bits to load the vision model(s) in
- `projector_device` - string specifying the `torch.device` to run the projector models (linear layers, Q-Former, etc.) on
- `projector_bits` - int, number of floating-point bits to load the projector models in

As a helper, `AbstractMultimodalPipeline` has `_get_device(self, setting_name: str, params: dict)` and `_get_dtype(self, setting_name: str, params: dict)` functions, which parse the string/int and return a `torch.device` / `torch.dtype`.
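
The defaults for these fields, as defined in the extension's `script.py` later in this commit, look like this (`None` for a device means "pick `cuda:0`, `xpu:0` or `cpu` automatically"):

```python
params = {
    "add_all_images_to_prompt": False,
    # device to run the vision encoder on
    "vision_device": None,
    # bits to load the vision encoder in, either 16 or 32
    "vision_bits": 32,
    # device to run the multimodal projector on
    "projector_device": None,
    # multimodal projector bits, either 32 or 16
    "projector_bits": 32,
}
```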

@@ -1,139 +0,0 @@
# Multimodal

## Description

Adds support for multimodality (text + images) to text-generation-webui.

Note: multimodal currently only works with the Transformers, AutoGPTQ, and GPTQ-for-LLaMa loaders. ExLlama (v1 and v2) and llama.cpp support are planned.

https://user-images.githubusercontent.com/3718215/233817203-69b57e77-0c55-4fd6-b742-3204bb13b8fc.mp4

## Usage

To run this extension, download an LLM that supports multimodality, and then start server.py with the appropriate `--multimodal-pipeline` argument. Examples:

```
# LLaVA 1.5 13B has the best performance
python server.py --model liuhaotian_llava-v1.5-13b --multimodal-pipeline llava-v1.5-13b --load-in-4bit
# LLaVA 1.5 7B is relatively weaker, but requires less memory
python server.py --model liuhaotian_llava-v1.5-7b --multimodal-pipeline llava-v1.5-7b --load-in-4bit
python server.py --model TheBloke_llava-v1.5-13B-GPTQ_gptq-4bit-32g-actorder_True --multimodal-pipeline llava-v1.5-13b --disable_exllama --loader autogptq
python server.py --model wojtab_llava-7b-v0-4bit-128g --multimodal-pipeline llava-7b
python server.py --model wojtab_llava-13b-v0-4bit-128g --multimodal-pipeline llava-13b
python server.py --model anon8231489123_vicuna-13b-GPTQ-4bit-128g --multimodal-pipeline minigpt4-13b
python server.py --model llama-7b-4bit --multimodal-pipeline minigpt4-7b
```

There is built-in support for LLaVA-v0-13B, LLaVA-v0-7B, and LLaVA-v1.5-13B. To install `minigpt4`:

- clone https://github.com/Wojtab/minigpt-4-pipeline into `extensions/multimodal/pipelines`
- install its requirements.txt

The same procedure can be used to install other pipelines, which can then be used with `--multimodal-pipeline [pipeline name]`. For additional multimodal pipelines, refer to the compatibility section below.

Note that each image takes up a considerable number of tokens, so set `max_new_tokens` to at most 1700 (the recommended value is between 200 and 500) so the images don't get truncated.

To send an image, upload it to the extension field below the chat and send a prompt as usual. The image will be added to the end of your message. If you wish to modify the placement, include the string `<image>` in your prompt.

Additionally, there is an *Embed all images, not only the last one* checkbox. By default (when it is unchecked), all but the most recent image have their embeddings left empty, so they are not fed to the network. Some multimodal networks appear to treat the features of all images as if they belonged to a single image, so by default the extension skips previous images; however, this can lead to sub-par generation with other pipelines. If you want to include all images, tick the checkbox.

## Compatibility

As of now, the following multimodal pipelines are supported:

|Pipeline|`--multimodal-pipeline`|Default LLM|LLM info (for the linked model)|Pipeline repository|
|-|-|-|-|-|
|[LLaVA 13B](https://github.com/haotian-liu/LLaVA)|`llava-13b`|[LLaVA 13B](https://huggingface.co/wojtab/llava-13b-v0-4bit-128g)|GPTQ 4-bit quant, old CUDA|built-in|
|[LLaVA 7B](https://github.com/haotian-liu/LLaVA)|`llava-7b`|[LLaVA 7B](https://huggingface.co/wojtab/llava-7b-v0-4bit-128g)|GPTQ 4-bit quant, old CUDA|built-in|
|[MiniGPT-4 7B](https://github.com/Vision-CAIR/MiniGPT-4)|`minigpt4-7b`|[Vicuna v0 7B](https://huggingface.co/TheBloke/vicuna-7B-GPTQ-4bit-128g)|GPTQ 4-bit quant, new format|[Wojtab/minigpt-4-pipeline](https://github.com/Wojtab/minigpt-4-pipeline)|
|[MiniGPT-4 13B](https://github.com/Vision-CAIR/MiniGPT-4)|`minigpt4-13b`|[Vicuna v0 13B](https://huggingface.co/anon8231489123/vicuna-13b-GPTQ-4bit-128g)|GPTQ 4-bit quant, old CUDA|[Wojtab/minigpt-4-pipeline](https://github.com/Wojtab/minigpt-4-pipeline)|
|[InstructBLIP 7B](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip)|`instructblip-7b`|[Vicuna v1.1 7B](https://huggingface.co/TheBloke/vicuna-7B-1.1-GPTQ-4bit-128g)|GPTQ 4-bit quant|[kjerk/instructblip-pipeline](https://github.com/kjerk/instructblip-pipeline)|
|[InstructBLIP 13B](https://github.com/salesforce/LAVIS/tree/main/projects/instructblip)|`instructblip-13b`|[Vicuna v1.1 13B](https://huggingface.co/TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g)|GPTQ 4-bit quant|[kjerk/instructblip-pipeline](https://github.com/kjerk/instructblip-pipeline)|

Some pipelines may work with different LLMs, but note that while it might work, it isn't a supported configuration.

DO NOT report bugs if you are using a different LLM.

DO NOT report pipeline bugs in this repository unless the pipeline is built-in.

## Extension config

This extension uses the following parameters (from `settings.json`):

|Parameter|Description|
|---------|-----------|
|`multimodal-vision_bits`|Number of bits to load the vision model (CLIP/ViT) feature extractor in (most pipelines support 32 or 16; default = 32)|
|`multimodal-vision_device`|Torch device to run the feature extractor on, for example `cpu` or `cuda:0`; by default `cuda:0` if available|
|`multimodal-projector_bits`|Number of bits to load the feature projector model(s) in (most pipelines support 32 or 16; default = 32)|
|`multimodal-projector_device`|Torch device to run the feature projector model(s) on, for example `cpu` or `cuda:0`; by default `cuda:0` if available|
|`multimodal-add_all_images_to_prompt`|Default value of the "Embed all images, not only the last one" checkbox|

## Usage through API

### Chat completions endpoint

#### With an image URL

```shell
curl http://127.0.0.1:5000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {
        "role": "user",
        "image_url": "https://avatars.githubusercontent.com/u/112222186?v=4"
      },
      {
        "role": "user",
        "content": "What is unusual about this image?"
      }
    ]
  }'
```

#### With a Base64 image

```python
import base64
import json
import requests

img = open('image.jpg', 'rb')
img_bytes = img.read()
img_base64 = base64.b64encode(img_bytes).decode('utf-8')
data = {
    "messages": [
        {
            "role": "user",
            "image_url": f"data:image/jpeg;base64,{img_base64}"
        },
        {
            "role": "user",
            "content": "what is unusual about this image?"
        }
    ]
}
response = requests.post('http://127.0.0.1:5000/v1/chat/completions', json=data)
print(response.text)
```

You can run multimodal inference through the API by embedding images in the prompt. Images are embedded like so: `f'<img src="data:image/jpeg;base64,{img_str}">'`, where `img_str` is base64 JPEG data. Note that you will need to launch server.py with the arguments `--api --extensions multimodal`.

### Completions endpoint

Python example:

```Python
import base64
import requests

CONTEXT = "You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. Follow the instructions carefully and explain your answers in detail.### Human: Hi!### Assistant: Hi there! How can I help you today?\n"

with open('extreme_ironing.jpg', 'rb') as f:
    img_str = base64.b64encode(f.read()).decode('utf-8')

prompt = CONTEXT + f'### Human: What is unusual about this image: \n<img src="data:image/jpeg;base64,{img_str}">### Assistant: '
print(requests.post('http://127.0.0.1:5000/v1/completions', json={'prompt': prompt, 'max_tokens': 200, 'stop': ['\n###']}).json())
```

Script output:

```Python
{'results': [{'text': "The unusual aspect of this image is that a man is standing on top of a yellow minivan while doing his laundry. He has set up a makeshift clothes line using the car's rooftop as an outdoor drying area. This scene is uncommon because people typically do their laundry indoors, in a dedicated space like a laundromat or a room in their home, rather than on top of a moving vehicle. Additionally, hanging clothes on the car could be potentially hazardous or illegal in some jurisdictions due to the risk of damaging the vehicle or causing accidents on the road.\n##"}]}
```

## For pipeline developers/technical description

See [DOCS.md](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/multimodal/DOCS.md).
|
@ -1,63 +0,0 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from typing import List, Optional
|
||||
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import is_torch_xpu_available
|
||||
|
||||
|
||||
class AbstractMultimodalPipeline(ABC):
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def name() -> str:
|
||||
'name of the pipeline, should be same as in --multimodal-pipeline'
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def image_start() -> Optional[str]:
|
||||
'return image start string, string representation of image start token, or None if not applicable'
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def image_end() -> Optional[str]:
|
||||
'return image end string, string representation of image end token, or None if not applicable'
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def placeholder_token_id() -> int:
|
||||
'return placeholder token id'
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def num_image_embeds() -> int:
|
||||
'return the number of embeds used by a single image (for example: 256 for LLaVA)'
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def embed_images(self, images: List[Image.Image]) -> torch.Tensor:
|
||||
'forward the images through vision pipeline, and return their embeddings'
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def embed_tokens(input_ids: torch.Tensor) -> torch.Tensor:
|
||||
'embed tokens, the exact function varies by LLM, for LLaMA it is `shared.model.model.embed_tokens`'
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def placeholder_embeddings() -> torch.Tensor:
|
||||
'get placeholder embeddings if there are multiple images, and `add_all_images_to_prompt` is False'
|
||||
pass
|
||||
|
||||
def _get_device(self, setting_name: str, params: dict):
|
||||
if params[setting_name] is None:
|
||||
return torch.device("cuda:0" if torch.cuda.is_available() else "xpu:0" if is_torch_xpu_available() else "cpu")
|
||||
return torch.device(params[setting_name])
|
||||
|
||||
def _get_dtype(self, setting_name: str, params: dict):
|
||||
return torch.float32 if int(params[setting_name]) == 32 else torch.float16
|
|
@ -1,178 +0,0 @@
|
|||
import base64
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from typing import Any, List, Optional
|
||||
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from extensions.multimodal.pipeline_loader import load_pipeline
|
||||
from modules import shared
|
||||
from modules.logging_colors import logger
|
||||
from modules.text_generation import encode, get_max_prompt_length
|
||||
|
||||
|
||||
@dataclass
|
||||
class PromptPart:
|
||||
text: str
|
||||
image: Optional[Image.Image] = None
|
||||
is_image: bool = False
|
||||
input_ids: Optional[torch.Tensor] = None
|
||||
embedding: Optional[torch.Tensor] = None
|
||||
|
||||
|
||||
class MultimodalEmbedder:
|
||||
def __init__(self, params: dict):
|
||||
pipeline, source = load_pipeline(params)
|
||||
self.pipeline = pipeline
|
||||
logger.info(f'Multimodal: loaded pipeline {self.pipeline.name()} from pipelines/{source} ({self.pipeline.__class__.__name__})')
|
||||
|
||||
def _split_prompt(self, prompt: str, load_images: bool = False) -> List[PromptPart]:
|
||||
"""Splits a prompt into a list of `PromptParts` to separate image data from text.
|
||||
It will also append `image_start` and `image_end` before and after the image, and optionally parse and load the images,
|
||||
if `load_images` is `True`.
|
||||
"""
|
||||
parts: List[PromptPart] = []
|
||||
curr = 0
|
||||
while True:
|
||||
match = re.search(r'<img src="data:image/jpeg;base64,([A-Za-z0-9+/=]+)">', prompt[curr:])
|
||||
if match is None:
|
||||
# no more image tokens, append the rest of the prompt
|
||||
if curr > 0:
|
||||
# add image end token after last image
|
||||
parts.append(PromptPart(text=self.pipeline.image_end() + prompt[curr:]))
|
||||
else:
|
||||
parts.append(PromptPart(text=prompt))
|
||||
break
|
||||
# found an image, append image start token to the text
|
||||
if match.start() > 0:
|
||||
parts.append(PromptPart(text=prompt[curr:curr + match.start()] + self.pipeline.image_start()))
|
||||
else:
|
||||
parts.append(PromptPart(text=self.pipeline.image_start()))
|
||||
# append the image
|
||||
parts.append(PromptPart(
|
||||
text=match.group(0),
|
||||
image=Image.open(BytesIO(base64.b64decode(match.group(1)))) if load_images else None,
|
||||
is_image=True
|
||||
))
|
||||
curr += match.end()
|
||||
return parts
|
||||
|
||||
def _len_in_tokens_prompt_parts(self, parts: List[PromptPart]) -> int:
|
||||
"""Total length in tokens of all `parts`"""
|
||||
tokens = 0
|
||||
for part in parts:
|
||||
if part.is_image:
|
||||
tokens += self.pipeline.num_image_embeds()
|
||||
elif part.input_ids is not None:
|
||||
tokens += len(part.input_ids)
|
||||
else:
|
||||
tokens += len(encode(part.text)[0])
|
||||
return tokens
|
||||
|
||||
def len_in_tokens(self, prompt: str) -> int:
|
||||
"""Total length in tokens for a given text `prompt`"""
|
||||
parts = self._split_prompt(prompt, False)
|
||||
return self._len_in_tokens_prompt_parts(parts)
|
||||
|
||||
def _encode_single_text(self, part: PromptPart, add_bos_token: bool) -> PromptPart:
|
||||
"""Encode a single prompt `part` to `input_ids`. Returns a `PromptPart`"""
|
||||
if part.is_image:
|
||||
placeholders = torch.ones((self.pipeline.num_image_embeds())) * self.pipeline.placeholder_token_id()
|
||||
part.input_ids = placeholders.to(shared.model.device, dtype=torch.int64)
|
||||
else:
|
||||
part.input_ids = encode(part.text, add_bos_token=add_bos_token)[0].to(shared.model.device, dtype=torch.int64)
|
||||
return part
|
||||
|
||||
@staticmethod
|
||||
def _num_images(parts: List[PromptPart]) -> int:
|
||||
count = 0
|
||||
for part in parts:
|
||||
if part.is_image:
|
||||
count += 1
|
||||
return count
|
||||
|
||||
def _encode_text(self, state, parts: List[PromptPart]) -> List[PromptPart]:
|
||||
"""Encode text to token_ids, also truncate the prompt, if necessary.
|
||||
|
||||
The chat/instruct mode should make prompts that fit in get_max_prompt_length, but if max_new_tokens are set
|
||||
such that the context + min_rows don't fit, we can get a prompt which is too long.
|
||||
We can't truncate image embeddings, as it leads to broken generation, so remove the images instead and warn the user
|
||||
"""
|
||||
encoded: List[PromptPart] = []
|
||||
for i, part in enumerate(parts):
|
||||
encoded.append(self._encode_single_text(part, i == 0 and state['add_bos_token']))
|
||||
|
||||
# truncation:
|
||||
max_len = get_max_prompt_length(state)
|
||||
removed_images = 0
|
||||
|
||||
# 1. remove entire text/image blocks
|
||||
while self._len_in_tokens_prompt_parts(encoded[1:]) > max_len:
|
||||
if encoded[0].is_image:
|
||||
removed_images += 1
|
||||
encoded = encoded[1:]
|
||||
|
||||
# 2. check if the last prompt part doesn't need to get truncated
|
||||
if self._len_in_tokens_prompt_parts(encoded) > max_len:
|
||||
if encoded[0].is_image:
|
||||
# don't truncate image embeddings, just remove the image, otherwise generation will be broken
|
||||
removed_images += 1
|
||||
encoded = encoded[1:]
|
||||
elif len(encoded) > 1 and encoded[0].text.endswith(self.pipeline.image_start()):
|
||||
# see if we can keep image_start token
|
||||
len_image_start = len(encode(self.pipeline.image_start(), add_bos_token=state['add_bos_token'])[0])
|
||||
if self._len_in_tokens_prompt_parts(encoded[1:]) + len_image_start > max_len:
|
||||
# we can't -> remove this text, and the image
|
||||
encoded = encoded[2:]
|
||||
removed_images += 1
|
||||
else:
|
||||
# we can -> just truncate the text
|
||||
trunc_len = self._len_in_tokens_prompt_parts(encoded) - max_len
|
||||
encoded[0].input_ids = encoded[0].input_ids[trunc_len:]
|
||||
elif len(encoded) > 0:
|
||||
# only one text left, truncate it normally
|
||||
trunc_len = self._len_in_tokens_prompt_parts(encoded) - max_len
|
||||
encoded[0].input_ids = encoded[0].input_ids[trunc_len:]
|
||||
|
||||
# notify user if we truncated an image
|
||||
if removed_images > 0:
|
||||
logger.warning(f"Multimodal: removed {removed_images} image(s) from prompt. Try decreasing max_new_tokens if generation is broken")
|
||||
|
||||
return encoded
|
||||
|
||||
def _embed(self, parts: List[PromptPart]) -> List[PromptPart]:
|
||||
# batch images
|
||||
image_indicies = [i for i, part in enumerate(parts) if part.is_image]
|
||||
embedded = self.pipeline.embed_images([parts[i].image for i in image_indicies])
|
||||
for i, embeds in zip(image_indicies, embedded):
|
||||
parts[i].embedding = embeds
|
||||
# embed text
|
||||
for (i, part) in enumerate(parts):
|
||||
if not part.is_image:
|
||||
parts[i].embedding = self.pipeline.embed_tokens(part.input_ids)
|
||||
return parts
|
||||
|
||||
def _remove_old_images(self, parts: List[PromptPart], params: dict) -> List[PromptPart]:
|
||||
if params['add_all_images_to_prompt']:
|
||||
return parts
|
||||
already_added = False
|
||||
for i, part in reversed(list(enumerate(parts))):
|
||||
if part.is_image:
|
||||
if already_added:
|
||||
parts[i].embedding = self.pipeline.placeholder_embeddings()
|
||||
else:
|
||||
already_added = True
|
||||
return parts
|
||||
|
||||
def forward(self, prompt: str, state: Any, params: dict):
|
||||
prompt_parts = self._split_prompt(prompt, True)
|
||||
prompt_parts = self._encode_text(state, prompt_parts)
|
||||
prompt_parts = self._embed(prompt_parts)
|
||||
prompt_parts = self._remove_old_images(prompt_parts, params)
|
||||
embeds = tuple(part.embedding for part in prompt_parts)
|
||||
ids = tuple(part.input_ids for part in prompt_parts)
|
||||
input_embeds = torch.cat(embeds, dim=0)
|
||||
input_ids = torch.cat(ids, dim=0)
|
||||
return prompt, input_ids, input_embeds, self._num_images(prompt_parts)
|
|
@ -1,52 +0,0 @@
|
|||
import traceback
|
||||
from importlib import import_module
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
from extensions.multimodal.abstract_pipeline import AbstractMultimodalPipeline
|
||||
from modules import shared
|
||||
from modules.logging_colors import logger
|
||||
|
||||
|
||||
def _get_available_pipeline_modules():
|
||||
pipeline_path = Path(__file__).parent / 'pipelines'
|
||||
modules = [p for p in pipeline_path.iterdir() if p.is_dir()]
|
||||
return [m.name for m in modules if (m / 'pipelines.py').exists()]
|
||||
|
||||
|
||||
def load_pipeline(params: dict) -> Tuple[AbstractMultimodalPipeline, str]:
|
||||
pipeline_modules = {}
|
||||
available_pipeline_modules = _get_available_pipeline_modules()
|
||||
for name in available_pipeline_modules:
|
||||
try:
|
||||
pipeline_modules[name] = import_module(f'extensions.multimodal.pipelines.{name}.pipelines')
|
||||
except:
|
||||
logger.warning(f'Failed to get multimodal pipelines from {name}')
|
||||
logger.warning(traceback.format_exc())
|
||||
|
||||
if shared.args.multimodal_pipeline is not None:
|
||||
for k in pipeline_modules:
|
||||
if hasattr(pipeline_modules[k], 'get_pipeline'):
|
||||
pipeline = getattr(pipeline_modules[k], 'get_pipeline')(shared.args.multimodal_pipeline, params)
|
||||
if pipeline is not None:
|
||||
return (pipeline, k)
|
||||
else:
|
||||
model_name = shared.args.model.lower()
|
||||
for k in pipeline_modules:
|
||||
if hasattr(pipeline_modules[k], 'get_pipeline_from_model_name'):
|
||||
pipeline = getattr(pipeline_modules[k], 'get_pipeline_from_model_name')(model_name, params)
|
||||
if pipeline is not None:
|
||||
return (pipeline, k)
|
||||
|
||||
available = []
|
||||
for k in pipeline_modules:
|
||||
if hasattr(pipeline_modules[k], 'available_pipelines'):
|
||||
pipelines = getattr(pipeline_modules[k], 'available_pipelines')
|
||||
available += pipelines
|
||||
|
||||
if shared.args.multimodal_pipeline is not None:
|
||||
log = f'Multimodal - ERROR: Failed to load multimodal pipeline "{shared.args.multimodal_pipeline}", available pipelines are: {available}.'
|
||||
else:
|
||||
log = f'Multimodal - ERROR: Failed to determine multimodal pipeline for model {shared.args.model}, please select one manually using --multimodal-pipeline [PIPELINE]. Available pipelines are: {available}.'
|
||||
logger.critical(f'{log} Please specify a correct pipeline, or disable the extension')
|
||||
raise RuntimeError(f'{log} Please specify a correct pipeline, or disable the extension')
|
|
@@ -1,9 +0,0 @@
## LLaVA pipeline

This module provides two pipelines:
- `llava-7b` - for use with the LLaVA v0 7B model (a finetuned LLaMA 7B)
- `llava-13b` - for use with the LLaVA v0 13B model (a finetuned LLaMA 13B)

[LLaVA](https://github.com/haotian-liu/LLaVA) uses CLIP `openai/clip-vit-large-patch14` as the vision model, followed by a single linear layer as the projector. For 13B the projector weights are in `liuhaotian/LLaVA-13b-delta-v0`, and for 7B they are in `liuhaotian/LLaVA-7b-delta-v0`.

The supported parameter combinations for both the vision model and the projector are: CUDA/32-bit, CUDA/16-bit, CPU/32-bit.
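
As a quick illustration of the projector described above (shapes taken from `llava.py` below, where `llava_projector_shape()` returns `(1024, 5120)` for the 13B model and each image yields 256 CLIP patch features):

```python
import torch

# The LLaVA v0 multimodal projector: a single linear layer mapping
# CLIP hidden states to the LLM's hidden size.
mm_projector = torch.nn.Linear(1024, 5120)
clip_features = torch.randn(1, 256, 1024)   # one image -> 256 patch features
image_embeds = mm_projector(clip_features)  # shape: (1, 256, 5120)
print(image_embeds.shape)
```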
@ -1,262 +0,0 @@
|
|||
import time
|
||||
from abc import abstractmethod
|
||||
from typing import List, Tuple
|
||||
|
||||
import torch
|
||||
from huggingface_hub import hf_hub_download
|
||||
from PIL import Image
|
||||
from transformers import CLIPImageProcessor, CLIPVisionModel
|
||||
|
||||
from extensions.multimodal.abstract_pipeline import AbstractMultimodalPipeline
|
||||
from modules import shared
|
||||
from modules.logging_colors import logger
|
||||
from modules.text_generation import encode
|
||||
|
||||
|
||||
def expand2square(pil_img: Image.Image, background_color: Tuple[int]) -> Image.Image:
|
||||
width, height = pil_img.size
|
||||
if width == height:
|
||||
return pil_img
|
||||
elif width > height:
|
||||
result = Image.new(pil_img.mode, (width, width), background_color)
|
||||
result.paste(pil_img, (0, (width - height) // 2))
|
||||
return result
|
||||
else:
|
||||
result = Image.new(pil_img.mode, (height, height), background_color)
|
||||
result.paste(pil_img, ((height - width) // 2, 0))
|
||||
return result
|
||||
|
||||
|
||||
class LLaVA_v0_Pipeline(AbstractMultimodalPipeline):
|
||||
CLIP_REPO = "openai/clip-vit-large-patch14"
|
||||
|
||||
def __init__(self, params: dict) -> None:
|
||||
super().__init__()
|
||||
self.clip_device = self._get_device("vision_device", params)
|
||||
self.clip_dtype = self._get_dtype("vision_bits", params)
|
||||
self.projector_device = self._get_device("projector_device", params)
|
||||
self.projector_dtype = self._get_dtype("projector_bits", params)
|
||||
self.image_processor, self.vision_tower, self.mm_projector = self._load_models()
|
||||
|
||||
def _load_models(self):
|
||||
start_ts = time.time()
|
||||
|
||||
logger.info(f"LLaVA - Loading CLIP from {self.CLIP_REPO} as {self.clip_dtype} on {self.clip_device}...")
|
||||
image_processor = CLIPImageProcessor.from_pretrained(self.CLIP_REPO, torch_dtype=self.clip_dtype)
|
||||
vision_tower = CLIPVisionModel.from_pretrained(self.CLIP_REPO, torch_dtype=self.clip_dtype).to(self.clip_device)
|
||||
|
||||
logger.info(f"LLaVA - Loading projector from {self.llava_projector_repo()} as {self.projector_dtype} on {self.projector_device}...")
|
||||
projector_path = hf_hub_download(self.llava_projector_repo(), self.llava_projector_filename())
|
||||
mm_projector = self.build_mm_projector()
|
||||
projector_data = torch.load(projector_path)
|
||||
projector_data = {k[19:]: v for k, v in projector_data.items() if k.startswith('model.mm_projector.')}
|
||||
mm_projector.load_state_dict(projector_data)
|
||||
mm_projector = mm_projector.to(self.projector_device)
|
||||
|
||||
logger.info(f"LLaVA supporting models loaded, took {time.time() - start_ts:.2f} seconds")
|
||||
return image_processor, vision_tower, mm_projector
|
||||
|
||||
def build_mm_projector(self) -> torch.nn.Module:
|
||||
projector_shape = self.llava_projector_shape()
|
||||
if len(projector_shape) == 2:
|
||||
return torch.nn.Linear(*projector_shape)
|
||||
else:
|
||||
modules = []
|
||||
modules.append(torch.nn.Linear(projector_shape[0], projector_shape[1]))
|
||||
for i in range(2, len(projector_shape)):
|
||||
modules.append(torch.nn.GELU())
|
||||
modules.append(torch.nn.Linear(projector_shape[i-1], projector_shape[i]))
|
||||
return torch.nn.Sequential(*modules)
|
||||
|
||||
@staticmethod
|
||||
def image_start() -> str:
|
||||
return "<im_start>"
|
||||
|
||||
@staticmethod
|
||||
def image_end() -> str:
|
||||
return "<im_end>"
|
||||
|
||||
@staticmethod
|
||||
def num_image_embeds() -> int:
|
||||
return 256
|
||||
|
||||
@staticmethod
|
||||
def embed_tokens(input_ids: torch.Tensor) -> torch.Tensor:
|
||||
for attr in ['', 'model', 'model.model', 'model.model.model']:
|
||||
tmp = getattr(shared.model, attr, None) if attr != '' else shared.model
|
||||
if tmp is not None and hasattr(tmp, 'embed_tokens'):
|
||||
func = tmp.embed_tokens
|
||||
break
|
||||
else:
|
||||
raise ValueError('The embed_tokens method has not been found for this loader.')
|
||||
|
||||
return func(input_ids).to(shared.model.device, dtype=shared.model.dtype)
|
||||
|
||||
@staticmethod
|
||||
def placeholder_embeddings() -> torch.Tensor:
|
||||
return LLaVA_v0_Pipeline.embed_tokens(encode("<im_patch>"*256, add_bos_token=False)[0])
|
||||
|
||||
def embed_images(self, images: List[Image.Image]) -> torch.Tensor:
|
||||
images = self.image_processor(images, return_tensors='pt')['pixel_values']
|
||||
images = images.to(self.clip_device, dtype=self.clip_dtype)
|
||||
|
||||
with torch.no_grad():
|
||||
image_forward_outs = self.vision_tower(images, output_hidden_states=True)
|
||||
select_hidden_state_layer = -2
|
||||
select_hidden_state = image_forward_outs.hidden_states[select_hidden_state_layer]
|
||||
image_features = select_hidden_state[:, 1:].to(self.projector_device, dtype=self.projector_dtype)
|
||||
image_features = self.mm_projector(image_features)
|
||||
return image_features.to(shared.model.device, dtype=shared.model.dtype)
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def llava_projector_repo() -> str:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def llava_projector_filename() -> str:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def llava_projector_shape() -> Tuple[int, int]:
|
||||
pass
|
||||
|
||||
|
||||
class LLaVA_v0_13B_Pipeline(LLaVA_v0_Pipeline):
|
||||
def __init__(self, params: dict) -> None:
|
||||
super().__init__(params)
|
||||
|
||||
@staticmethod
|
||||
def name() -> str:
|
||||
return "llava-13b"
|
||||
|
||||
@staticmethod
|
||||
def placeholder_token_id() -> int:
|
||||
return 32000
|
||||
|
||||
@staticmethod
|
||||
def llava_projector_shape() -> Tuple[int, int]:
|
||||
return (1024, 5120)
|
||||
|
||||
@staticmethod
|
||||
def llava_projector_filename() -> str:
|
||||
return "mm_projector.bin"
|
||||
|
||||
@staticmethod
|
||||
def llava_projector_repo() -> str:
|
||||
return "liuhaotian/LLaVA-13b-delta-v0"
|
||||
|
||||
|
||||
class LLaVA_v0_7B_Pipeline(LLaVA_v0_Pipeline):
|
||||
def __init__(self, params: dict) -> None:
|
||||
super().__init__(params)
|
||||
|
||||
@staticmethod
|
||||
def name() -> str:
|
||||
return "llava-7b"
|
||||
|
||||
@staticmethod
|
||||
def placeholder_token_id() -> int:
|
||||
return 32001
|
||||
|
||||
@staticmethod
|
||||
def llava_projector_shape() -> Tuple[int, int]:
|
||||
return (1024, 4096)
|
||||
|
||||
@staticmethod
|
||||
def llava_projector_filename() -> str:
|
||||
return "mm_projector.bin"
|
||||
|
||||
@staticmethod
|
||||
def llava_projector_repo() -> str:
|
||||
return "liuhaotian/LLaVA-7b-delta-v0"
|
||||
|
||||
|
||||
class LLaVA_LLaMA_2_13B_Pipeline(LLaVA_v0_13B_Pipeline):
|
||||
def __init__(self, params: dict) -> None:
|
||||
super().__init__(params)
|
||||
|
||||
@staticmethod
|
||||
def name() -> str:
|
||||
return "llava-llama-2-13b"
|
||||
|
||||
@staticmethod
|
||||
def placeholder_token_id() -> int:
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def llava_projector_repo() -> str:
|
||||
return "liuhaotian/llava-llama-2-13b-chat-lightning-preview"
|
||||
|
||||
@staticmethod
|
||||
def image_start() -> str:
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def image_end() -> str:
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def placeholder_embeddings() -> torch.Tensor:
|
||||
return LLaVA_v0_Pipeline.embed_tokens(encode("<unk>"*256, add_bos_token=False)[0])
|
||||
|
||||
|
||||
class LLaVA_v1_5_13B_Pipeline(LLaVA_v0_13B_Pipeline):
|
||||
CLIP_REPO = "openai/clip-vit-large-patch14-336"
|
||||
|
||||
def __init__(self, params: dict) -> None:
|
||||
super().__init__(params)
|
||||
|
||||
@staticmethod
|
||||
def name() -> str:
|
||||
return "llava-v1.5-13b"
|
||||
|
||||
@staticmethod
|
||||
def llava_projector_shape() -> Tuple[int, int]:
|
||||
return (1024, 5120, 5120)
|
||||
|
||||
@staticmethod
|
||||
def placeholder_token_id() -> int:
|
||||
return 0
|
||||
|
||||
@staticmethod
|
||||
def llava_projector_repo() -> str:
|
||||
return "liuhaotian/llava-v1.5-13b"
|
||||
|
||||
@staticmethod
|
||||
def image_start() -> str:
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def image_end() -> str:
|
||||
return ""
|
||||
|
||||
@staticmethod
|
||||
def num_image_embeds() -> int:
|
||||
return 576
|
||||
|
||||
def embed_images(self, images: List[Image.Image]) -> torch.Tensor:
|
||||
# pad it to square first
|
||||
images = [
|
||||
expand2square(image, tuple(int(x*255) for x in self.image_processor.image_mean))
|
||||
for image in images
|
||||
]
|
||||
return super().embed_images(images)
|
||||
|
||||
@staticmethod
|
||||
def placeholder_embeddings() -> torch.Tensor:
|
||||
return LLaVA_v0_Pipeline.embed_tokens(encode("<unk>"*576, add_bos_token=False)[0])
|
||||
|
||||
class LLaVA_v1_5_7B_Pipeline(LLaVA_v1_5_13B_Pipeline):
|
||||
@staticmethod
|
||||
def name() -> str:
|
||||
return "llava-v1.5-7b"
|
||||
|
||||
@staticmethod
|
||||
def llava_projector_shape() -> Tuple[int, int]:
|
||||
return (1024, 4096, 4096)
|
||||
@staticmethod
|
||||
def llava_projector_repo() -> str:
|
||||
return "liuhaotian/llava-v1.5-7b"
|
|
@ -1,48 +0,0 @@
|
|||
from typing import Optional
|
||||
|
||||
from extensions.multimodal.abstract_pipeline import AbstractMultimodalPipeline
|
||||
|
||||
available_pipelines = ['llava-7b', 'llava-13b', 'llava-llama-2-13b', 'llava-v1.5-13b', 'llava-v1.5-7b']
|
||||
|
||||
|
||||
def get_pipeline(name: str, params: dict) -> Optional[AbstractMultimodalPipeline]:
|
||||
if name == 'llava-7b':
|
||||
from .llava import LLaVA_v0_7B_Pipeline
|
||||
return LLaVA_v0_7B_Pipeline(params)
|
||||
if name == 'llava-13b':
|
||||
from .llava import LLaVA_v0_13B_Pipeline
|
||||
return LLaVA_v0_13B_Pipeline(params)
|
||||
if name == 'llava-llama-2-13b':
|
||||
from .llava import LLaVA_LLaMA_2_13B_Pipeline
|
||||
return LLaVA_LLaMA_2_13B_Pipeline(params)
|
||||
if name == 'llava-v1.5-7b':
|
||||
from .llava import LLaVA_v1_5_7B_Pipeline
|
||||
return LLaVA_v1_5_7B_Pipeline(params)
|
||||
if name == 'llava-v1.5-13b':
|
||||
from .llava import LLaVA_v1_5_13B_Pipeline
|
||||
return LLaVA_v1_5_13B_Pipeline(params)
|
||||
return None
|
||||
|
||||
|
||||
def get_pipeline_from_model_name(model_name: str, params: dict) -> Optional[AbstractMultimodalPipeline]:
|
||||
if 'llava' not in model_name.lower():
|
||||
return None
|
||||
if 'llama-2' in model_name.lower():
|
||||
if '13b' in model_name.lower():
|
||||
from .llava import LLaVA_LLaMA_2_13B_Pipeline
|
||||
return LLaVA_LLaMA_2_13B_Pipeline(params)
|
||||
elif 'llava-v1.5' in model_name.lower():
|
||||
if '13b' in model_name.lower():
|
||||
from .llava import LLaVA_v1_5_13B_Pipeline
|
||||
return LLaVA_v1_5_13B_Pipeline(params)
|
||||
if '7b' in model_name.lower():
|
||||
from .llava import LLaVA_v1_5_7B_Pipeline
|
||||
return LLaVA_v1_5_7B_Pipeline(params)
|
||||
else:
|
||||
if '7b' in model_name.lower():
|
||||
from .llava import LLaVA_v0_7B_Pipeline
|
||||
return LLaVA_v0_7B_Pipeline(params)
|
||||
if '13b' in model_name.lower():
|
||||
from .llava import LLaVA_v0_13B_Pipeline
|
||||
return LLaVA_v0_13B_Pipeline(params)
|
||||
return None
|
|
@ -1,113 +0,0 @@
|
|||
import base64
|
||||
import re
|
||||
import time
|
||||
from functools import partial
|
||||
from io import BytesIO
|
||||
|
||||
import gradio as gr
|
||||
import torch
|
||||
|
||||
from extensions.multimodal.multimodal_embedder import MultimodalEmbedder
|
||||
from modules import shared
|
||||
from modules.logging_colors import logger
|
||||
|
||||
params = {
|
||||
"add_all_images_to_prompt": False,
|
||||
# device to run vision encoder on
|
||||
"vision_device": None,
|
||||
# bits to load vision encoder in, either 16 or 32
|
||||
"vision_bits": 32,
|
||||
# device to run multimodal projector on
|
||||
"projector_device": None,
|
||||
# multimodal projector bits, either 32 or 16
|
||||
"projector_bits": 32
|
||||
}
|
||||
|
||||
|
||||
# If 'state' is True, will hijack the next chat generation
|
||||
input_hijack = {
|
||||
'state': False,
|
||||
'value': ["", ""]
|
||||
}
|
||||
|
||||
|
||||
# initialized in ui, so that params are loaded from settings
|
||||
multimodal_embedder: MultimodalEmbedder = None
|
||||
|
||||
|
||||
def chat_input_modifier(text, visible_text, state):
|
||||
global input_hijack
|
||||
if input_hijack['state']:
|
||||
input_hijack['state'] = False
|
||||
return input_hijack['value'](text, visible_text)
|
||||
else:
|
||||
return text, visible_text
|
||||
|
||||
|
||||
def add_chat_picture(picture, text, visible_text):
|
||||
# resize the image, so that shortest edge is at least 224 (size for CLIP), and at most 300 (to keep history manageable)
|
||||
# Adjusted to 336 for the values here, due to the increased resolution in llava-v1.5
|
||||
max_hw, min_hw = max(picture.size), min(picture.size)
|
||||
aspect_ratio = max_hw / min_hw
|
||||
shortest_edge = int(max(336 / aspect_ratio, 336))
|
||||
longest_edge = int(shortest_edge * aspect_ratio)
|
||||
w = shortest_edge if picture.width < picture.height else longest_edge
|
||||
h = shortest_edge if picture.width >= picture.height else longest_edge
|
||||
picture = picture.resize((w, h))
|
||||
|
||||
buffer = BytesIO()
|
||||
picture.save(buffer, format="PNG")
|
||||
img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
||||
image = f'<img src="data:image/jpeg;base64,{img_str}">'
|
||||
|
||||
if '<image>' in text:
|
||||
text = text.replace('<image>', image)
|
||||
else:
|
||||
text = image + '\n' + text
|
||||
|
||||
if visible_text == '' or visible_text is None:
|
||||
visible_text = text
|
||||
elif '<image>' in visible_text:
|
||||
visible_text = visible_text.replace('<image>', image)
|
||||
else:
|
||||
visible_text = visible_text + '\n' + image
|
||||
|
||||
return text, visible_text
|
||||
|
||||
|
||||
def custom_tokenized_length(prompt):
|
||||
return multimodal_embedder.len_in_tokens(prompt)
|
||||
|
||||
|
||||
def tokenizer_modifier(state, prompt, input_ids, input_embeds):
|
||||
global params
|
||||
start_ts = time.time()
|
||||
image_match = re.search(r'<img src="data:image/jpeg;base64,[A-Za-z0-9+/=]+">', prompt)
|
||||
|
||||
if image_match is None:
|
||||
return prompt, input_ids, input_embeds
|
||||
|
||||
prompt, input_ids, input_embeds, total_embedded = multimodal_embedder.forward(prompt, state, params)
|
||||
logger.info(f'Embedded {total_embedded} image(s) in {time.time()-start_ts:.2f}s')
|
||||
return (prompt,
|
||||
input_ids.unsqueeze(0).to(shared.model.device, dtype=torch.int64),
|
||||
input_embeds.unsqueeze(0).to(shared.model.device, dtype=shared.model.dtype))
|
||||
|
||||
|
||||
def ui():
|
||||
global multimodal_embedder
|
||||
multimodal_embedder = MultimodalEmbedder(params)
|
||||
with gr.Column():
|
||||
picture_select = gr.Image(label='Send a picture', type='pil')
|
||||
# The models don't seem to deal well with multiple images
|
||||
single_image_checkbox = gr.Checkbox(False, label='Embed all images, not only the last one')
|
||||
# Prepare the input hijack
|
||||
picture_select.upload(
|
||||
lambda picture: input_hijack.update({"state": True, "value": partial(add_chat_picture, picture)}),
|
||||
[picture_select],
|
||||
None
|
||||
)
|
||||
picture_select.clear(lambda: input_hijack.update({"state": False, "value": ["", ""]}), None, None)
|
||||
single_image_checkbox.change(lambda x: params.update({"add_all_images_to_prompt": x}), single_image_checkbox, None)
|
||||
shared.gradio['Generate'].click(lambda: None, None, picture_select)
|
||||
shared.gradio['textbox'].submit(lambda: None, None, picture_select)
|
|
@@ -5,7 +5,5 @@ from modules.logits import get_next_logits
def _get_next_logits(body):
    # Pre-process the input payload to simulate a real generation
    use_samplers = body['use_samplers']
    state = process_parameters(body) if use_samplers else {}
    state['stream'] = True

    state = process_parameters(body)
    return get_next_logits(body['prompt'], state, use_samplers, "", top_logits=body['top_logits'], return_dict=True)
|
|
@ -1,115 +0,0 @@
|
|||
import torch
|
||||
from numba import njit
|
||||
|
||||
from modules import shared
|
||||
|
||||
|
||||
def process_llamacpp_cache(model, new_sequence, past_sequence):
|
||||
if len(past_sequence) == 0 or len(new_sequence) == 0:
|
||||
return past_sequence
|
||||
|
||||
i1, i2, j1, j2 = find_longest_common_substring_indices(past_sequence, new_sequence)
|
||||
overlap_length = i2 - i1 + 1
|
||||
|
||||
# Do StreamingLLM if i1 > 0 (ie the longest common subsequence is not a prefix)
|
||||
# and the overlap length is sufficiently long.
|
||||
if i1 > 0 and overlap_length > 0.2 * len(new_sequence):
|
||||
|
||||
new_sequence = torch.tensor(new_sequence)
|
||||
past_sequence = torch.tensor(past_sequence)
|
||||
|
||||
prefix_length = find_prefix_length(past_sequence[:i1], new_sequence[:j1])
|
||||
sink_length = max(prefix_length, shared.args.attention_sink_size)
|
||||
removed_length = i1 - sink_length
|
||||
|
||||
if removed_length <= 0:
|
||||
return past_sequence.tolist()
|
||||
|
||||
matching_prefix = past_sequence[:prefix_length]
|
||||
removed_chunk = past_sequence[sink_length:i1]
|
||||
overlapping_sequence = new_sequence[j1:j2 + 1]
|
||||
added_chunk = new_sequence[j2 + 1:]
|
||||
|
||||
# print(past_sequence.tolist())
|
||||
# print(new_sequence.tolist())
|
||||
|
||||
print()
|
||||
print('MATCHING PREFIX=', repr(shared.tokenizer.decode(matching_prefix)))
|
||||
print('ADDED CHUNK=', repr(shared.tokenizer.decode(added_chunk)))
|
||||
print('REMOVED CHUNK=', repr(shared.tokenizer.decode(removed_chunk)))
|
||||
print('REMOVED LENGTH=', removed_length)
|
||||
print()
|
||||
|
||||
# Remove interval [sink_length, sink_length + removed_length) from the context
|
||||
# Update model.n_tokens
|
||||
model._ctx.kv_cache_seq_rm(0, sink_length, sink_length + removed_length)
|
||||
model._ctx.kv_cache_seq_shift(0, sink_length + removed_length, -1, -removed_length)
|
||||
|
||||
new_sequence = new_sequence.tolist()
|
||||
model.input_ids[:j2 + 1] = new_sequence[:j2 + 1]
|
||||
model.n_tokens = j2 + 1
|
||||
|
||||
return new_sequence[:j2 + 1]
|
||||
else:
|
||||
return past_sequence
|
||||
|
||||
|
||||
def find_prefix_length(past_seq, seq_tensor):
|
||||
'''
|
||||
Given two torch tensors, finds the length of the longest
|
||||
common prefix between the two.
|
||||
'''
|
||||
min_length = min(past_seq.shape[0], seq_tensor.shape[0])
|
||||
indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
|
||||
if len(indices) > 0:
|
||||
prefix_length = indices[0].item()
|
||||
else:
|
||||
prefix_length = min_length
|
||||
|
||||
return prefix_length
|
||||
|
||||
|
||||
@njit
|
||||
def find_longest_common_substring_indices(list1, list2):
|
||||
'''
|
||||
Given two lists, solves the Longest Common Substring problem.
|
||||
|
||||
It returns the indices where the substring starts and ends in
|
||||
s1 and s2.
|
||||
|
||||
Example:
|
||||
|
||||
ir, jr, ir2, jr2 = find_longest_common_substring_indices(s1, s2)
|
||||
print(s1[ir:jr + 1])
|
||||
print(s2[ir2:jr2 + 1])
|
||||
|
||||
Adapted from
|
||||
https://rosettacode.org/wiki/Longest_common_substring#Python
|
||||
'''
|
||||
|
||||
len_list1, len_list2 = len(list1), len(list2)
|
||||
start_index_list1, end_index_list1 = 0, -1
|
||||
start_index_list2, end_index_list2 = 0, -1
|
||||
|
||||
# for index1 in tqdm(range(0, len_list1), desc="StreamingLLM prompt comparison", leave=False):
|
||||
for index1 in range(0, len_list1):
|
||||
try:
|
||||
index2 = list2.index(list1[index1])
|
||||
except:
|
||||
continue
|
||||
|
||||
while index2 >= 0:
|
||||
temp_index1, temp_index2 = index1, index2
|
||||
while temp_index1 < len_list1 and temp_index2 < len_list2 and list2[temp_index2] == list1[temp_index1]:
|
||||
if temp_index1 - index1 >= end_index_list1 - start_index_list1:
|
||||
start_index_list1, end_index_list1 = index1, temp_index1
|
||||
start_index_list2, end_index_list2 = index2, temp_index2
|
||||
|
||||
temp_index1 += 1
|
||||
temp_index2 += 1
|
||||
try:
|
||||
index2 = list2.index(list1[index1], index2 + 1)
|
||||
except:
|
||||
break
|
||||
|
||||
return start_index_list1, end_index_list1, start_index_list2, end_index_list2
|
|
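As an aside, here is a minimal sketch (not part of this commit) of how the two removed helpers behave on toy token lists; it assumes the helpers from the deleted `modules/cache_utils.py` are still importable, with `torch` and `numba` installed. The indices they return are what `process_llamacpp_cache` used to decide how much of the KV cache to drop.

```python
# Illustrative only: toy token lists standing in for real token ids.
import torch

past = [10, 11, 12, 13, 14, 15, 16]     # tokens already in the cache
new = [10, 11, 99, 13, 14, 15, 16, 17]  # tokens of the new prompt

# The longest common substring is [13, 14, 15, 16], located at indices 3..6
# in both lists, so the StreamingLLM branch is entered (i1 > 0 and the
# overlap length 4 exceeds 0.2 * len(new)).
i1, i2, j1, j2 = find_longest_common_substring_indices(past, new)
assert (i1, i2, j1, j2) == (3, 6, 3, 6)

# The shared prefix before the diverging token is [10, 11], so prefix_length == 2.
# process_llamacpp_cache keeps max(prefix_length, attention_sink_size) tokens as
# the sink and only trims if anything remains between the sink and index i1.
prefix_length = find_prefix_length(torch.tensor(past[:i1]), torch.tensor(new[:j1]))
assert prefix_length == 2
```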
@@ -40,17 +40,13 @@ def calculate_perplexity(models, input_dataset, stride, _max_length):
|
|||
'''
|
||||
|
||||
if shared.args.loader == "llama.cpp":
|
||||
logger.error("llamacpp_HF is required for perplexity evaluation with GGUF models. Please reload the model with llamacpp_HF instead of llama.cpp.")
|
||||
logger.error("Perplexity evaluation is not implemented for the llama.cpp loader.")
|
||||
raise ValueError
|
||||
|
||||
if shared.args.loader == "ExLlamav2":
|
||||
logger.error("ExLlamav2_HF is required for perplexity evaluation with EXL2 models. Please reload the model with ExLlamav2_HF instead of ExLlamav2.")
|
||||
raise ValueError
|
||||
|
||||
if shared.args.loader == "llamacpp_HF" and not shared.args.logits_all:
|
||||
logger.error("--logits_all is required for perplexity evaluation with GGUF models. Please reload the model with that option set/checked.")
|
||||
raise ValueError
|
||||
|
||||
if not shared.args.no_use_fast:
|
||||
logger.warning("--no_use_fast is not set. If tokenizing the input dataset takes a long time, try reloading the model with that option set/checked.")
|
||||
|
||||
|
|
|
@@ -157,6 +157,9 @@ class Exllamav2HF(PreTrainedModel):
|
|||
else:
|
||||
self.past_seq = seq_tensor
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.synchronize()
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
# Shift so that tokens < n predict n
|
||||
|
|
|
@@ -153,6 +153,9 @@ class Exllamav3HF(PreTrainedModel):
|
|||
else:
|
||||
self.past_seq = seq_tensor
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.synchronize()
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
# Shift so that tokens < n predict n
|
||||
|
|
|
@@ -391,8 +391,10 @@ def generate_chat_html(history, name1, name2, reset_cache=False):
|
|||
|
||||
def chat_html_wrapper(history, name1, name2, mode, style, character, reset_cache=False):
|
||||
if mode == 'instruct':
|
||||
return generate_instruct_html(history)
|
||||
result = generate_instruct_html(history)
|
||||
elif style == 'wpp':
|
||||
return generate_chat_html(history, name1, name2)
|
||||
result = generate_chat_html(history, name1, name2)
|
||||
else:
|
||||
return generate_cai_chat_html(history, name1, name2, style, character, reset_cache)
|
||||
result = generate_cai_chat_html(history, name1, name2, style, character, reset_cache)
|
||||
|
||||
return {'html': result}
|
||||
|
|
|
@@ -1,165 +0,0 @@
|
|||
import importlib
|
||||
import platform
|
||||
from typing import Sequence
|
||||
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
|
||||
from modules import shared
|
||||
from modules.cache_utils import process_llamacpp_cache
|
||||
|
||||
imported_module = None
|
||||
not_available_modules = set()
|
||||
|
||||
|
||||
def llama_cpp_lib():
|
||||
global imported_module, not_available_modules
|
||||
|
||||
# Determine the platform
|
||||
is_macos = platform.system() == 'Darwin'
|
||||
|
||||
# Define the library names based on the platform
|
||||
if is_macos:
|
||||
lib_names = [
|
||||
(None, 'llama_cpp')
|
||||
]
|
||||
else:
|
||||
lib_names = [
|
||||
('cpu', 'llama_cpp'),
|
||||
('tensorcores', 'llama_cpp_cuda_tensorcores'),
|
||||
(None, 'llama_cpp_cuda'),
|
||||
(None, 'llama_cpp')
|
||||
]
|
||||
|
||||
for arg, lib_name in lib_names:
|
||||
if lib_name in not_available_modules:
|
||||
continue
|
||||
|
||||
should_import = (arg is None or getattr(shared.args, arg))
|
||||
|
||||
if should_import:
|
||||
if imported_module and imported_module != lib_name:
|
||||
# Conflict detected, raise an exception
|
||||
raise Exception(f"Cannot import `{lib_name}` because `{imported_module}` is already imported. Switching to a different version of llama-cpp-python currently requires a server restart.")
|
||||
|
||||
try:
|
||||
return_lib = importlib.import_module(lib_name)
|
||||
imported_module = lib_name
|
||||
monkey_patch_llama_cpp_python(return_lib)
|
||||
return return_lib
|
||||
except ImportError:
|
||||
not_available_modules.add(lib_name)
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def eval_with_progress(self, tokens: Sequence[int]):
|
||||
"""
|
||||
A copy of
|
||||
|
||||
https://github.com/abetlen/llama-cpp-python/blob/main/llama_cpp/llama.py
|
||||
|
||||
with tqdm to show prompt processing progress.
|
||||
"""
|
||||
self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
|
||||
|
||||
if len(tokens) > self.n_batch:
|
||||
progress_bar = tqdm(range(0, len(tokens), self.n_batch), desc="Prompt evaluation", leave=False)
|
||||
else:
|
||||
progress_bar = range(0, len(tokens), self.n_batch)
|
||||
|
||||
for i in progress_bar:
|
||||
batch = tokens[i : min(len(tokens), i + self.n_batch)]
|
||||
n_past = self.n_tokens
|
||||
n_tokens = len(batch)
|
||||
self._batch.set_batch(
|
||||
batch=batch, n_past=n_past, logits_all=self.context_params.logits_all
|
||||
)
|
||||
self._ctx.decode(self._batch)
|
||||
# Save tokens
|
||||
self.input_ids[n_past : n_past + n_tokens] = batch
|
||||
# Save logits
|
||||
if self.context_params.logits_all:
|
||||
rows = n_tokens
|
||||
cols = self._n_vocab
|
||||
logits = np.ctypeslib.as_array(
|
||||
self._ctx.get_logits(), shape=(rows * cols,)
|
||||
)
|
||||
self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits
|
||||
self.last_updated_index = n_past + n_tokens - 1
|
||||
else:
|
||||
rows = 1
|
||||
cols = self._n_vocab
|
||||
logits = np.ctypeslib.as_array(
|
||||
self._ctx.get_logits(), shape=(rows * cols,)
|
||||
)
|
||||
last_token_index = min(n_past + n_tokens - 1, self.scores.shape[0] - 1)
|
||||
self.scores[last_token_index, :] = logits.reshape(-1)
|
||||
self.last_updated_index = last_token_index
|
||||
# Update n_tokens
|
||||
self.n_tokens += n_tokens
|
||||
|
||||
|
||||
def monkey_patch_llama_cpp_python(lib):
|
||||
if getattr(lib.Llama, '_is_patched', False):
|
||||
# If the patch is already applied, do nothing
|
||||
return
|
||||
|
||||
def my_generate(self, *args, **kwargs):
|
||||
if shared.args.streaming_llm:
|
||||
new_sequence = args[0]
|
||||
past_sequence = self._input_ids
|
||||
|
||||
# Do the cache trimming for StreamingLLM
|
||||
process_llamacpp_cache(self, new_sequence, past_sequence)
|
||||
|
||||
for output in self.original_generate(*args, **kwargs):
|
||||
yield output
|
||||
|
||||
lib.Llama.eval = eval_with_progress
|
||||
lib.Llama.original_generate = lib.Llama.generate
|
||||
lib.Llama.generate = my_generate
|
||||
|
||||
# Also patch Jinja2ChatFormatter to handle loop controls
|
||||
if hasattr(lib, 'llama_chat_format') and hasattr(lib.llama_chat_format, 'Jinja2ChatFormatter'):
|
||||
Formatter = lib.llama_chat_format.Jinja2ChatFormatter
|
||||
|
||||
if not getattr(Formatter, '_is_patched', False):
|
||||
def patched_init(self, *args, **kwargs):
|
||||
# Extract parameters from args or kwargs
|
||||
if args:
|
||||
self.template = args[0]
|
||||
self.eos_token = args[1] if len(args) > 1 else kwargs.get('eos_token')
|
||||
self.bos_token = args[2] if len(args) > 2 else kwargs.get('bos_token')
|
||||
self.add_generation_prompt = args[3] if len(args) > 3 else kwargs.get('add_generation_prompt', True)
|
||||
self.stop_token_ids = args[4] if len(args) > 4 else kwargs.get('stop_token_ids')
|
||||
else:
|
||||
self.template = kwargs.get('template')
|
||||
self.eos_token = kwargs.get('eos_token')
|
||||
self.bos_token = kwargs.get('bos_token')
|
||||
self.add_generation_prompt = kwargs.get('add_generation_prompt', True)
|
||||
self.stop_token_ids = kwargs.get('stop_token_ids')
|
||||
|
||||
# Process stop tokens as in the original
|
||||
self.stop_token_ids = (
|
||||
set(self.stop_token_ids) if self.stop_token_ids is not None else None
|
||||
)
|
||||
|
||||
# Create environment with loopcontrols extension
|
||||
import jinja2
|
||||
from jinja2.ext import loopcontrols
|
||||
|
||||
self._environment = jinja2.sandbox.ImmutableSandboxedEnvironment(
|
||||
loader=jinja2.BaseLoader(),
|
||||
trim_blocks=True,
|
||||
lstrip_blocks=True,
|
||||
extensions=[loopcontrols]
|
||||
).from_string(self.template)
|
||||
|
||||
# Replace the original __init__ with our patched version
|
||||
Formatter.__init__ = patched_init
|
||||
Formatter._is_patched = True
|
||||
|
||||
# Set the flag to indicate that the patch has been applied
|
||||
lib.Llama._is_patched = True
|
339
modules/llama_cpp_server.py
Normal file
|
@@ -0,0 +1,339 @@
|
|||
import json
|
||||
import pprint
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
|
||||
import llama_cpp_binaries
|
||||
import requests
|
||||
|
||||
from modules import shared
|
||||
from modules.logging_colors import logger
|
||||
|
||||
llamacpp_valid_cache_types = {"fp16", "q8_0", "q4_0"}
|
||||
|
||||
|
||||
class LlamaServer:
|
||||
def __init__(self, model_path, server_path=None):
|
||||
"""
|
||||
Initialize and start a server for llama.cpp models.
|
||||
"""
|
||||
self.model_path = model_path
|
||||
self.server_path = server_path
|
||||
self.port = self._find_available_port()
|
||||
self.process = None
|
||||
self.vocabulary_size = None
|
||||
self.bos_token = "<s>"
|
||||
|
||||
# Start the server
|
||||
self._start_server()
|
||||
|
||||
def encode(self, text, add_bos_token=False, **kwargs):
|
||||
if self.bos_token and text.startswith(self.bos_token):
|
||||
add_bos_token = False
|
||||
|
||||
url = f"http://localhost:{self.port}/tokenize"
|
||||
payload = {
|
||||
"content": text,
|
||||
"add_special": add_bos_token,
|
||||
}
|
||||
|
||||
response = requests.post(url, json=payload)
|
||||
result = response.json()
|
||||
return result.get("tokens", [])
|
||||
|
||||
def decode(self, token_ids, **kwargs):
|
||||
url = f"http://localhost:{self.port}/detokenize"
|
||||
payload = {
|
||||
"tokens": token_ids,
|
||||
}
|
||||
|
||||
response = requests.post(url, json=payload)
|
||||
result = response.json()
|
||||
return result.get("content", "")
|
||||
|
||||
def prepare_payload(self, state):
|
||||
# Prepare DRY
|
||||
dry_sequence_breakers = state['dry_sequence_breakers']
|
||||
if not dry_sequence_breakers.startswith("["):
|
||||
dry_sequence_breakers = "[" + dry_sequence_breakers + "]"
|
||||
dry_sequence_breakers = json.loads(dry_sequence_breakers)
|
||||
|
||||
# Prepare the sampler order
|
||||
samplers = state["sampler_priority"]
|
||||
samplers = samplers.split("\n") if isinstance(samplers, str) else samplers
|
||||
penalty_found = False
|
||||
filtered_samplers = []
|
||||
for s in samplers:
|
||||
if s.strip() in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]:
|
||||
filtered_samplers.append(s.strip())
|
||||
elif not penalty_found and s.strip() == "repetition_penalty":
|
||||
filtered_samplers.append("penalties")
|
||||
penalty_found = True
|
||||
|
||||
samplers = filtered_samplers
|
||||
|
||||
# Move temperature to the end if temperature_last is true and temperature exists in the list
|
||||
if state["temperature_last"] and "temperature" in samplers:
|
||||
samplers.remove("temperature")
|
||||
samplers.append("temperature")
|
||||
|
||||
payload = {
|
||||
"temperature": state["temperature"] if not state["dynamic_temperature"] else (state["dynatemp_low"] + state["dynatemp_high"]) / 2,
|
||||
"dynatemp_range": 0 if not state["dynamic_temperature"] else (state["dynatemp_high"] - state["dynatemp_low"]) / 2,
|
||||
"dynatemp_exponent": state["dynatemp_exponent"],
|
||||
"top_k": state["top_k"],
|
||||
"top_p": state["top_p"],
|
||||
"min_p": state["min_p"],
|
||||
"tfs_z": state["tfs"],
|
||||
"typical_p": state["typical_p"],
|
||||
"repeat_penalty": state["repetition_penalty"],
|
||||
"repeat_last_n": state["repetition_penalty_range"],
|
||||
"presence_penalty": state["presence_penalty"],
|
||||
"frequency_penalty": state["frequency_penalty"],
|
||||
"dry_multiplier": state["dry_multiplier"],
|
||||
"dry_base": state["dry_base"],
|
||||
"dry_allowed_length": state["dry_allowed_length"],
|
||||
"dry_penalty_last_n": state["repetition_penalty_range"],
|
||||
"dry_sequence_breakers": dry_sequence_breakers,
|
||||
"xtc_probability": state["xtc_probability"],
|
||||
"xtc_threshold": state["xtc_threshold"],
|
||||
"mirostat": state["mirostat_mode"],
|
||||
"mirostat_tau": state["mirostat_tau"],
|
||||
"mirostat_eta": state["mirostat_eta"],
|
||||
"grammar": state["grammar_string"],
|
||||
"seed": state["seed"],
|
||||
"ignore_eos": state["ban_eos_token"],
|
||||
"samplers": samplers,
|
||||
}
|
||||
|
||||
if state['custom_token_bans']:
|
||||
to_ban = [[int(token_id), False] for token_id in state['custom_token_bans'].split(',')]
|
||||
payload["logit_bias"] = to_ban
|
||||
|
||||
return payload
|
||||
|
||||
def generate_with_streaming(
|
||||
self,
|
||||
prompt,
|
||||
state,
|
||||
):
|
||||
url = f"http://localhost:{self.port}/completion"
|
||||
payload = self.prepare_payload(state)
|
||||
|
||||
token_ids = self.encode(prompt, add_bos_token=state["add_bos_token"])
|
||||
if state['auto_max_new_tokens']:
|
||||
max_new_tokens = state['truncation_length'] - len(token_ids)
|
||||
else:
|
||||
max_new_tokens = state['max_new_tokens']
|
||||
|
||||
payload.update({
|
||||
"prompt": token_ids,
|
||||
"n_predict": max_new_tokens,
|
||||
"stream": True,
|
||||
})
|
||||
|
||||
if shared.args.verbose:
|
||||
logger.info("GENERATE_PARAMS=")
|
||||
printable_payload = {k: v for k, v in payload.items() if k != "prompt"}
|
||||
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
|
||||
print()
|
||||
|
||||
# Make a direct request with streaming enabled
|
||||
response = requests.post(url, json=payload, stream=True)
|
||||
response.raise_for_status() # Raise an exception for HTTP errors
|
||||
|
||||
full_text = ""
|
||||
|
||||
# Process the streaming response
|
||||
for line in response.iter_lines():
|
||||
if shared.stop_everything:
|
||||
break
|
||||
|
||||
if line:
|
||||
try:
|
||||
# Check if the line starts with "data: " and remove it
|
||||
line_str = line.decode('utf-8')
|
||||
if line_str.startswith('data: '):
|
||||
line_str = line_str[6:] # Remove the "data: " prefix
|
||||
|
||||
# Parse the JSON data
|
||||
data = json.loads(line_str)
|
||||
|
||||
# Extract the token content
|
||||
if 'content' in data:
|
||||
token_text = data['content']
|
||||
full_text += token_text
|
||||
yield full_text
|
||||
|
||||
# Check if generation is complete
|
||||
if data.get('stop', False):
|
||||
break
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
# Log the error and the problematic line
|
||||
print(f"JSON decode error: {e}")
|
||||
print(f"Problematic line: {line}")
|
||||
continue
|
||||
|
||||
def get_logits(self, prompt, state, n_probs=128, use_samplers=False):
|
||||
"""Get the logits/probabilities for the next token after a prompt"""
|
||||
url = f"http://localhost:{self.port}/completion"
|
||||
|
||||
payload = self.prepare_payload(state)
|
||||
payload.update({
|
||||
"prompt": self.encode(prompt, add_bos_token=state["add_bos_token"]),
|
||||
"n_predict": 0,
|
||||
"logprobs": True,
|
||||
"n_probs": n_probs,
|
||||
"stream": False,
|
||||
"post_sampling_probs": use_samplers,
|
||||
})
|
||||
|
||||
if shared.args.verbose:
|
||||
logger.info("GENERATE_PARAMS=")
|
||||
printable_payload = {k: v for k, v in payload.items() if k != "prompt"}
|
||||
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
|
||||
print()
|
||||
|
||||
response = requests.post(url, json=payload)
|
||||
result = response.json()
|
||||
|
||||
if "completion_probabilities" in result:
|
||||
if use_samplers:
|
||||
return result["completion_probabilities"][0]["top_probs"]
|
||||
else:
|
||||
return result["completion_probabilities"][0]["top_logprobs"]
|
||||
else:
|
||||
raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}")
|
||||
|
||||
def _get_vocabulary_size(self):
|
||||
"""Get and store the model's maximum context length."""
|
||||
url = f"http://localhost:{self.port}/v1/models"
|
||||
response = requests.get(url).json()
|
||||
|
||||
if "data" in response and len(response["data"]) > 0:
|
||||
model_info = response["data"][0]
|
||||
if "meta" in model_info and "n_vocab" in model_info["meta"]:
|
||||
self.vocabulary_size = model_info["meta"]["n_vocab"]
|
||||
|
||||
def _get_bos_token(self):
|
||||
"""Get and store the model's BOS token."""
|
||||
url = f"http://localhost:{self.port}/props"
|
||||
response = requests.get(url).json()
|
||||
if "bos_token" in response:
|
||||
self.bos_token = response["bos_token"]
|
||||
|
||||
def _find_available_port(self):
|
||||
"""Find an available port by letting the OS assign one."""
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
s.bind(('', 0)) # Bind to port 0 to get an available port
|
||||
return s.getsockname()[1]
|
||||
|
||||
def _start_server(self):
|
||||
"""Start the llama.cpp server and wait until it's ready."""
|
||||
# Determine the server path
|
||||
if self.server_path is None:
|
||||
self.server_path = llama_cpp_binaries.get_binary_path()
|
||||
|
||||
# Build the command
|
||||
cmd = [
|
||||
self.server_path,
|
||||
"--model", self.model_path,
|
||||
"--ctx-size", str(shared.args.n_ctx),
|
||||
"--n-gpu-layers", str(shared.args.n_gpu_layers),
|
||||
"--batch-size", str(shared.args.batch_size),
|
||||
"--rope-freq-base", str(shared.args.rope_freq_base),
|
||||
"--port", str(self.port),
|
||||
]
|
||||
|
||||
if shared.args.flash_attn:
|
||||
cmd.append("--flash-attn")
|
||||
if shared.args.threads > 0:
|
||||
cmd += ["--threads", str(shared.args.threads)]
|
||||
if shared.args.threads_batch > 0:
|
||||
cmd += ["--threads-batch", str(shared.args.threads_batch)]
|
||||
if shared.args.no_mmap:
|
||||
cmd.append("--no-mmap")
|
||||
if shared.args.mlock:
|
||||
cmd.append("--mlock")
|
||||
if shared.args.tensor_split:
|
||||
cmd += ["--tensor-split", shared.args.tensor_split]
|
||||
if shared.args.numa:
|
||||
cmd += ["--numa", "distribute"]
|
||||
if shared.args.no_kv_offload:
|
||||
cmd.append("--no-kv-offload")
|
||||
if shared.args.row_split:
|
||||
cmd += ["--split-mode", "row"]
|
||||
if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types:
|
||||
cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type]
|
||||
if shared.args.compress_pos_emb != 1:
|
||||
cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)]
|
||||
|
||||
# Start the server with pipes for output
|
||||
self.process = subprocess.Popen(
|
||||
cmd,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
bufsize=1
|
||||
)
|
||||
|
||||
def filter_stderr():
|
||||
for line in iter(self.process.stderr.readline, ''):
|
||||
if not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line:
|
||||
sys.stderr.write(line)
|
||||
sys.stderr.flush()
|
||||
|
||||
threading.Thread(target=filter_stderr, daemon=True).start()
|
||||
|
||||
# Wait for server to be healthy
|
||||
health_url = f"http://localhost:{self.port}/health"
|
||||
start_time = time.time()
|
||||
timeout = 3600 * 8 # 8 hours
|
||||
while time.time() - start_time < timeout:
|
||||
# Check if process is still alive
|
||||
if self.process.poll() is not None:
|
||||
# Process has terminated
|
||||
exit_code = self.process.poll()
|
||||
raise RuntimeError(f"Server process terminated unexpectedly with exit code: {exit_code}")
|
||||
|
||||
try:
|
||||
response = requests.get(health_url)
|
||||
if response.status_code == 200:
|
||||
break
|
||||
except:
|
||||
pass
|
||||
|
||||
time.sleep(1)
|
||||
else:
|
||||
raise TimeoutError(f"Server health check timed out after {timeout} seconds")
|
||||
|
||||
# Server is now healthy, get model info
|
||||
self._get_vocabulary_size()
|
||||
self._get_bos_token()
|
||||
return self.port
|
||||
|
||||
def __enter__(self):
|
||||
"""Support for context manager."""
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Support for context manager."""
|
||||
self.stop()
|
||||
|
||||
def __del__(self):
|
||||
"""Cleanup when the object is deleted."""
|
||||
self.stop()
|
||||
|
||||
def stop(self):
|
||||
"""Stop the server process."""
|
||||
if self.process:
|
||||
self.process.terminate()
|
||||
try:
|
||||
self.process.wait(timeout=5)
|
||||
except subprocess.TimeoutExpired:
|
||||
self.process.kill()
|
||||
|
||||
self.process = None
|
|
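A hypothetical usage sketch for the new class (the model path is a placeholder, and it assumes the code runs inside the webui so that `shared.args` is already populated, since `_start_server()` reads the llama.cpp flags from it):

```python
from modules.llama_cpp_server import LlamaServer

# The context manager guarantees stop() runs, so the llama.cpp server
# subprocess does not outlive the caller.
with LlamaServer("models/my-model.gguf") as server:  # placeholder path
    ids = server.encode("Hello there", add_bos_token=True)
    print(f"{len(ids)} prompt tokens, vocabulary size {server.vocabulary_size}")
    print(server.decode(ids))
```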
@@ -1,220 +0,0 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
|
||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||
|
||||
from modules import shared
|
||||
from modules.llama_cpp_python_hijack import llama_cpp_lib
|
||||
from modules.llamacpp_model import get_llamacpp_cache_type_for_string
|
||||
from modules.logging_colors import logger
|
||||
|
||||
|
||||
class LlamacppHF(PreTrainedModel):
|
||||
def __init__(self, model, path):
|
||||
super().__init__(PretrainedConfig())
|
||||
self.model = model
|
||||
self.generation_config = GenerationConfig()
|
||||
|
||||
self.past_seq = None
|
||||
self.llamacpp_cache = {
|
||||
'n_tokens': self.model.n_tokens,
|
||||
'input_ids': self.model.input_ids,
|
||||
'scores': self.model.scores,
|
||||
'ctx': self.model._ctx.ctx
|
||||
}
|
||||
|
||||
if shared.args.cfg_cache:
|
||||
self.past_seq_negative = None
|
||||
self.llamacpp_cache_negative = {
|
||||
'n_tokens': self.model.n_tokens,
|
||||
'input_ids': self.model.input_ids.copy(),
|
||||
'scores': self.model.scores.copy(),
|
||||
'ctx': llama_cpp_lib().llama_new_context_with_model(model.model, model.context_params)
|
||||
}
|
||||
|
||||
def _validate_model_class(self):
|
||||
pass
|
||||
|
||||
def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
|
||||
pass
|
||||
|
||||
def prepare_inputs_for_generation(self, input_ids, **kwargs):
|
||||
return {'input_ids': input_ids, **kwargs}
|
||||
|
||||
def save_cache(self):
|
||||
self.llamacpp_cache.update({
|
||||
'n_tokens': self.model.n_tokens,
|
||||
'input_ids': self.model.input_ids,
|
||||
'scores': self.model.scores,
|
||||
'ctx': self.model._ctx.ctx
|
||||
})
|
||||
|
||||
def save_negative_cache(self):
|
||||
self.llamacpp_cache_negative.update({
|
||||
'n_tokens': self.model.n_tokens,
|
||||
'input_ids': self.model.input_ids,
|
||||
'scores': self.model.scores,
|
||||
'ctx': self.model._ctx.ctx
|
||||
})
|
||||
|
||||
def load_cache(self):
|
||||
self.model.n_tokens = self.llamacpp_cache['n_tokens']
|
||||
self.model.input_ids = self.llamacpp_cache['input_ids']
|
||||
self.model.scores = self.llamacpp_cache['scores']
|
||||
self.model._ctx.ctx = self.llamacpp_cache['ctx']
|
||||
|
||||
def load_negative_cache(self):
|
||||
self.model.n_tokens = self.llamacpp_cache_negative['n_tokens']
|
||||
self.model.input_ids = self.llamacpp_cache_negative['input_ids']
|
||||
self.model.scores = self.llamacpp_cache_negative['scores']
|
||||
self.model._ctx.ctx = self.llamacpp_cache_negative['ctx']
|
||||
|
||||
@property
|
||||
def device(self) -> torch.device:
|
||||
return torch.device(0)
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
use_cache = kwargs.get('use_cache', True)
|
||||
labels = kwargs.get('labels', None)
|
||||
past_key_values = kwargs.get('past_key_values', None)
|
||||
|
||||
if len(args) > 0:
|
||||
if not shared.args.cfg_cache:
|
||||
logger.error("Please enable the cfg-cache option to use CFG with llamacpp_HF.")
|
||||
return
|
||||
|
||||
input_ids = args[0]
|
||||
is_negative = True
|
||||
past_seq = self.past_seq_negative
|
||||
self.load_negative_cache()
|
||||
else:
|
||||
input_ids = kwargs['input_ids']
|
||||
is_negative = False
|
||||
past_seq = self.past_seq
|
||||
self.load_cache()
|
||||
|
||||
seq = input_ids[0].tolist()
|
||||
if is_negative and past_key_values is not None:
|
||||
seq = past_key_values + seq
|
||||
|
||||
seq_tensor = torch.tensor(seq)
|
||||
reset = True
|
||||
|
||||
# Make the forward call. The prefix-match code has been adapted from
|
||||
# https://github.com/abetlen/llama-cpp-python/commit/f4090a0bb2a2a25acfe28d31c82cc1aa273bedee
|
||||
if labels is None:
|
||||
if past_seq is not None:
|
||||
min_length = min(past_seq.shape[0], seq_tensor.shape[0])
|
||||
indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
|
||||
if len(indices) > 0:
|
||||
longest_prefix = indices[0].item()
|
||||
else:
|
||||
longest_prefix = min_length
|
||||
|
||||
if longest_prefix > 0:
|
||||
reset = False
|
||||
self.model.n_tokens = longest_prefix
|
||||
if len(seq_tensor) - longest_prefix > 0:
|
||||
self.model.eval(seq[longest_prefix:])
|
||||
else:
|
||||
self.model.n_tokens -= 1
|
||||
self.model.eval([seq[-1]])
|
||||
|
||||
if reset:
|
||||
self.model.reset()
|
||||
self.model.eval(seq)
|
||||
|
||||
logits = torch.tensor(self.model.scores[self.model.last_updated_index, :]).view(1, 1, -1).to(input_ids.device)
|
||||
else:
|
||||
self.model.reset()
|
||||
self.model.eval(seq)
|
||||
logits = torch.tensor(self.model.eval_logits)
|
||||
logits = logits.view(1, logits.shape[0], logits.shape[1]).to(input_ids.device)
|
||||
|
||||
if is_negative:
|
||||
self.save_negative_cache()
|
||||
self.past_seq_negative = seq_tensor
|
||||
else:
|
||||
self.save_cache()
|
||||
self.past_seq = seq_tensor
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
# Shift so that tokens < n predict n
|
||||
shift_logits = logits[..., :-1, :].contiguous()
|
||||
shift_labels = labels[..., 1:].contiguous()
|
||||
# Flatten the tokens
|
||||
loss_fct = CrossEntropyLoss()
|
||||
shift_logits = shift_logits.view(-1, logits.shape[-1])
|
||||
shift_labels = shift_labels.view(-1)
|
||||
# Enable model parallelism
|
||||
shift_labels = shift_labels.to(shift_logits.device)
|
||||
loss = loss_fct(shift_logits, shift_labels)
|
||||
|
||||
return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
|
||||
assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"
|
||||
|
||||
if isinstance(pretrained_model_name_or_path, str):
|
||||
pretrained_model_name_or_path = Path(pretrained_model_name_or_path)
|
||||
|
||||
path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
|
||||
if path.is_file():
|
||||
model_file = path
|
||||
else:
|
||||
model_file = sorted(path.glob('*.gguf'))[0]
|
||||
|
||||
logger.info(f"llama.cpp weights detected: {model_file}\n")
|
||||
|
||||
if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '':
|
||||
tensor_split_list = None
|
||||
else:
|
||||
tensor_split_list = [float(x) for x in shared.args.tensor_split.strip().split(",")]
|
||||
|
||||
params = {
|
||||
'model_path': str(model_file),
|
||||
'n_ctx': shared.args.n_ctx,
|
||||
'n_threads': shared.args.threads or None,
|
||||
'n_threads_batch': shared.args.threads_batch or None,
|
||||
'n_batch': shared.args.n_batch,
|
||||
'use_mmap': not shared.args.no_mmap,
|
||||
'use_mlock': shared.args.mlock,
|
||||
'mul_mat_q': not shared.args.no_mul_mat_q,
|
||||
'numa': shared.args.numa,
|
||||
'n_gpu_layers': shared.args.n_gpu_layers,
|
||||
'rope_freq_base': shared.args.rope_freq_base,
|
||||
'tensor_split': tensor_split_list,
|
||||
'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
|
||||
'logits_all': shared.args.logits_all,
|
||||
'offload_kqv': not shared.args.no_offload_kqv,
|
||||
'split_mode': 1 if not shared.args.row_split else 2,
|
||||
'flash_attn': shared.args.flash_attn
|
||||
}
|
||||
|
||||
if shared.args.cache_type != 'fp16':
|
||||
params["type_k"] = get_llamacpp_cache_type_for_string(shared.args.cache_type)
|
||||
params["type_v"] = get_llamacpp_cache_type_for_string(shared.args.cache_type)
|
||||
|
||||
Llama = llama_cpp_lib().Llama
|
||||
try:
|
||||
model = Llama(**params)
|
||||
except Exception as e:
|
||||
error_message = (
|
||||
f"Failed loading the model. **This usually happens due to lack of memory**. Try these steps:\n"
|
||||
f"1. Reduce the context length `n_ctx` (currently {shared.args.n_ctx})."
|
||||
f"{' Try a lower value like 4096.' if shared.args.n_ctx > 4096 else '.'}"
|
||||
"\n"
|
||||
f"2. Lower the `n-gpu-layers` value (currently {shared.args.n_gpu_layers})."
|
||||
)
|
||||
|
||||
raise type(e)(error_message) from e
|
||||
|
||||
model.last_updated_index = -1
|
||||
|
||||
return LlamacppHF(model, model_file)
|
|
@@ -1,218 +0,0 @@
|
|||
import re
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from modules import shared
|
||||
from modules.callbacks import Iteratorize
|
||||
from modules.llama_cpp_python_hijack import llama_cpp_lib
|
||||
from modules.logging_colors import logger
|
||||
from modules.text_generation import get_max_prompt_length
|
||||
|
||||
llamacpp_quant_mapping = {
|
||||
'f32': 0,
|
||||
'fp16': 1,
|
||||
'q4_0': 2,
|
||||
'q4_1': 3,
|
||||
'q5_0': 6,
|
||||
'q5_1': 7,
|
||||
'q8_0': 8,
|
||||
'q8_1': 9,
|
||||
'q2_k': 10,
|
||||
'q3_k': 11,
|
||||
'q4_k': 12,
|
||||
'q5_k': 13,
|
||||
'q6_k': 14,
|
||||
'q8_k': 15,
|
||||
'iq4_nl': 20,
|
||||
'bf16': 30,
|
||||
}
|
||||
|
||||
llamacpp_valid_cache_types = {'fp16', 'q8_0', 'q4_0'}
|
||||
|
||||
|
||||
def get_llamacpp_cache_type_for_string(quant_type: str):
|
||||
quant_type = quant_type.lower()
|
||||
if quant_type in llamacpp_valid_cache_types:
|
||||
return llamacpp_quant_mapping[quant_type]
|
||||
else:
|
||||
raise ValueError(f"Invalid cache type for llama.cpp: {quant_type}. Valid options are: fp16, q8_0, q4_0.")
|
||||
|
||||
|
||||
def ban_eos_logits_processor(eos_token, input_ids, logits):
|
||||
logits[eos_token] = -float('inf')
|
||||
return logits
|
||||
|
||||
|
||||
def custom_token_ban_logits_processor(token_ids, input_ids, logits):
|
||||
for token_id in token_ids:
|
||||
logits[token_id] = -float('inf')
|
||||
|
||||
return logits
|
||||
|
||||
|
||||
class LlamaCppModel:
|
||||
def __init__(self):
|
||||
self.initialized = False
|
||||
self.grammar_string = ''
|
||||
self.grammar = None
|
||||
|
||||
def __del__(self):
|
||||
del self.model
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(self, path):
|
||||
|
||||
Llama = llama_cpp_lib().Llama
|
||||
LlamaCache = llama_cpp_lib().LlamaCache
|
||||
|
||||
result = self()
|
||||
cache_capacity = 0
|
||||
if shared.args.cache_capacity is not None:
|
||||
if 'GiB' in shared.args.cache_capacity:
|
||||
cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1000 * 1000 * 1000
|
||||
elif 'MiB' in shared.args.cache_capacity:
|
||||
cache_capacity = int(re.sub('[a-zA-Z]', '', shared.args.cache_capacity)) * 1000 * 1000
|
||||
else:
|
||||
cache_capacity = int(shared.args.cache_capacity)
|
||||
|
||||
if cache_capacity > 0:
|
||||
logger.info("Cache capacity is " + str(cache_capacity) + " bytes")
|
||||
|
||||
if shared.args.tensor_split is None or shared.args.tensor_split.strip() == '':
|
||||
tensor_split_list = None
|
||||
else:
|
||||
tensor_split_list = [float(x) for x in shared.args.tensor_split.strip().split(",")]
|
||||
|
||||
params = {
|
||||
'model_path': str(path),
|
||||
'n_ctx': shared.args.n_ctx,
|
||||
'n_threads': shared.args.threads or None,
|
||||
'n_threads_batch': shared.args.threads_batch or None,
|
||||
'n_batch': shared.args.n_batch,
|
||||
'use_mmap': not shared.args.no_mmap,
|
||||
'use_mlock': shared.args.mlock,
|
||||
'mul_mat_q': not shared.args.no_mul_mat_q,
|
||||
'numa': shared.args.numa,
|
||||
'n_gpu_layers': shared.args.n_gpu_layers,
|
||||
'rope_freq_base': shared.args.rope_freq_base,
|
||||
'tensor_split': tensor_split_list,
|
||||
'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
|
||||
'offload_kqv': not shared.args.no_offload_kqv,
|
||||
'split_mode': 1 if not shared.args.row_split else 2,
|
||||
'flash_attn': shared.args.flash_attn
|
||||
}
|
||||
|
||||
if shared.args.cache_type != 'fp16':
|
||||
params["type_k"] = get_llamacpp_cache_type_for_string(shared.args.cache_type)
|
||||
params["type_v"] = get_llamacpp_cache_type_for_string(shared.args.cache_type)
|
||||
|
||||
try:
|
||||
result.model = Llama(**params)
|
||||
except Exception as e:
|
||||
error_message = (
|
||||
f"Failed loading the model. **This usually happens due to lack of memory**. Try these steps:\n"
|
||||
f"1. Reduce the context length `n_ctx` (currently {shared.args.n_ctx})."
|
||||
f"{' Try a lower value like 4096.' if shared.args.n_ctx > 4096 else '.'}"
|
||||
"\n"
|
||||
f"2. Lower the `n-gpu-layers` value (currently {shared.args.n_gpu_layers})."
|
||||
)
|
||||
|
||||
raise type(e)(error_message) from e
|
||||
|
||||
if cache_capacity > 0:
|
||||
result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
|
||||
|
||||
# This is ugly, but the model and the tokenizer are the same object in this library.
|
||||
return result, result
|
||||
|
||||
def encode(self, string):
|
||||
if type(string) is str:
|
||||
string = string.encode()
|
||||
|
||||
return self.model.tokenize(string)
|
||||
|
||||
def decode(self, ids, **kwargs):
|
||||
detokenized = self.model.detokenize(ids)
|
||||
try:
|
||||
# Attempt strict UTF-8 decoding first
|
||||
return detokenized.decode('utf-8', 'strict')
|
||||
except UnicodeDecodeError as e:
|
||||
# Log the error and fall back to UTF-8 with replacement
|
||||
logger.warning(f"Invalid UTF-8 in detokenized output. Using replacement characters.\n{e}")
|
||||
return detokenized.decode('utf-8', 'replace')
|
||||
|
||||
def get_logits(self, tokens):
|
||||
self.model.reset()
|
||||
self.model.eval(tokens)
|
||||
logits = self.model._scores
|
||||
logits = np.expand_dims(logits, 0) # batch dim is expected
|
||||
return torch.tensor(logits, dtype=torch.float32)
|
||||
|
||||
def load_grammar(self, string):
|
||||
if string != self.grammar_string:
|
||||
self.grammar_string = string
|
||||
if string.strip() != '':
|
||||
self.grammar = llama_cpp_lib().LlamaGrammar.from_string(string)
|
||||
else:
|
||||
self.grammar = None
|
||||
|
||||
def generate(self, prompt, state, callback=None):
|
||||
LogitsProcessorList = llama_cpp_lib().LogitsProcessorList
|
||||
prompt = prompt if type(prompt) is str else prompt.decode()
|
||||
|
||||
# Handle truncation
|
||||
prompt = self.encode(prompt)
|
||||
prompt = prompt[-get_max_prompt_length(state):]
|
||||
prompt = self.decode(prompt)
|
||||
|
||||
self.load_grammar(state['grammar_string'])
|
||||
logit_processors = LogitsProcessorList()
|
||||
if state['ban_eos_token']:
|
||||
logit_processors.append(partial(ban_eos_logits_processor, self.model.token_eos()))
|
||||
|
||||
if state['custom_token_bans']:
|
||||
to_ban = [int(x) for x in state['custom_token_bans'].split(',')]
|
||||
if len(to_ban) > 0:
|
||||
logit_processors.append(partial(custom_token_ban_logits_processor, to_ban))
|
||||
|
||||
completion_chunks = self.model.create_completion(
|
||||
prompt=prompt,
|
||||
max_tokens=state['max_new_tokens'],
|
||||
temperature=state['temperature'],
|
||||
top_p=state['top_p'] if state['top_p'] < 1 else 0.999,
|
||||
min_p=state['min_p'],
|
||||
typical_p=state['typical_p'],
|
||||
frequency_penalty=state['frequency_penalty'],
|
||||
presence_penalty=state['presence_penalty'],
|
||||
repeat_penalty=state['repetition_penalty'],
|
||||
top_k=state['top_k'],
|
||||
stream=True,
|
||||
seed=int(state['seed']) if state['seed'] != -1 else None,
|
||||
tfs_z=state['tfs'],
|
||||
mirostat_mode=int(state['mirostat_mode']),
|
||||
mirostat_tau=state['mirostat_tau'],
|
||||
mirostat_eta=state['mirostat_eta'],
|
||||
logits_processor=logit_processors,
|
||||
grammar=self.grammar
|
||||
)
|
||||
|
||||
output = ""
|
||||
for completion_chunk in completion_chunks:
|
||||
if shared.stop_everything:
|
||||
break
|
||||
|
||||
text = completion_chunk['choices'][0]['text']
|
||||
output += text
|
||||
if callback:
|
||||
callback(text)
|
||||
|
||||
return output
|
||||
|
||||
def generate_with_streaming(self, *args, **kwargs):
|
||||
with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
|
||||
reply = ''
|
||||
for token in generator:
|
||||
reply += token
|
||||
yield reply
|
|
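For context, `generate_with_streaming` above turns the callback-based `generate()` into a generator via `Iteratorize` (imported from `modules/callbacks.py`, which is not shown in this diff). A minimal standalone sketch of that general pattern, not the actual `Iteratorize` implementation:

```python
import queue
import threading


def iterate_callback(func, *args, **kwargs):
    """Run func(*args, callback=..., **kwargs) in a thread and yield each
    value the callback receives, finishing when the function returns."""
    q = queue.Queue()
    sentinel = object()

    def worker():
        try:
            func(*args, callback=q.put, **kwargs)
        finally:
            q.put(sentinel)  # always signal completion, even on error

    threading.Thread(target=worker, daemon=True).start()
    while (item := q.get()) is not sentinel:
        yield item


# e.g.: for token in iterate_callback(model.generate, prompt, state): ...
```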
@@ -30,51 +30,19 @@ loaders_and_params = OrderedDict({
|
|||
'n_gpu_layers',
|
||||
'threads',
|
||||
'threads_batch',
|
||||
'n_batch',
|
||||
'batch_size',
|
||||
'n_ctx',
|
||||
'cache_type',
|
||||
'tensor_split',
|
||||
'rope_freq_base',
|
||||
'compress_pos_emb',
|
||||
'attention_sink_size',
|
||||
'tensorcores',
|
||||
'flash_attn',
|
||||
'streaming_llm',
|
||||
'cpu',
|
||||
'row_split',
|
||||
'no_offload_kqv',
|
||||
'no_mul_mat_q',
|
||||
'no_kv_offload',
|
||||
'no_mmap',
|
||||
'mlock',
|
||||
'numa',
|
||||
],
|
||||
'llamacpp_HF': [
|
||||
'n_gpu_layers',
|
||||
'threads',
|
||||
'threads_batch',
|
||||
'n_batch',
|
||||
'n_ctx',
|
||||
'cache_type',
|
||||
'tensor_split',
|
||||
'rope_freq_base',
|
||||
'compress_pos_emb',
|
||||
'attention_sink_size',
|
||||
'tensorcores',
|
||||
'flash_attn',
|
||||
'streaming_llm',
|
||||
'cpu',
|
||||
'row_split',
|
||||
'no_offload_kqv',
|
||||
'no_mul_mat_q',
|
||||
'no_mmap',
|
||||
'mlock',
|
||||
'numa',
|
||||
'cfg_cache',
|
||||
'logits_all',
|
||||
'trust_remote_code',
|
||||
'no_use_fast',
|
||||
'llamacpp_HF_info',
|
||||
],
|
||||
'ExLlamav3_HF': [
|
||||
'max_seq_len',
|
||||
'gpu_split',
|
||||
|
@@ -307,66 +275,34 @@ loaders_samplers = {
|
|||
'dry_sequence_breakers',
|
||||
},
|
||||
'llama.cpp': {
|
||||
'temperature',
|
||||
'min_p',
|
||||
'top_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'tfs',
|
||||
'repetition_penalty',
|
||||
'frequency_penalty',
|
||||
'presence_penalty',
|
||||
'mirostat_mode',
|
||||
'mirostat_tau',
|
||||
'mirostat_eta',
|
||||
'ban_eos_token',
|
||||
'seed',
|
||||
'custom_token_bans',
|
||||
'grammar_string',
|
||||
'grammar_file_row',
|
||||
},
|
||||
'llamacpp_HF': {
|
||||
'temperature',
|
||||
'dynatemp_low',
|
||||
'dynatemp_high',
|
||||
'dynatemp_exponent',
|
||||
'smoothing_factor',
|
||||
'smoothing_curve',
|
||||
'min_p',
|
||||
'top_p',
|
||||
'top_k',
|
||||
'typical_p',
|
||||
'xtc_threshold',
|
||||
'xtc_probability',
|
||||
'epsilon_cutoff',
|
||||
'eta_cutoff',
|
||||
'tfs',
|
||||
'top_a',
|
||||
'top_n_sigma',
|
||||
'dry_multiplier',
|
||||
'dry_allowed_length',
|
||||
'dry_base',
|
||||
'repetition_penalty',
|
||||
'frequency_penalty',
|
||||
'presence_penalty',
|
||||
'encoder_repetition_penalty',
|
||||
'no_repeat_ngram_size',
|
||||
'repetition_penalty_range',
|
||||
'guidance_scale',
|
||||
'mirostat_mode',
|
||||
'mirostat_tau',
|
||||
'mirostat_eta',
|
||||
'do_sample',
|
||||
'dynamic_temperature',
|
||||
'temperature_last',
|
||||
'auto_max_new_tokens',
|
||||
'ban_eos_token',
|
||||
'add_bos_token',
|
||||
'skip_special_tokens',
|
||||
'seed',
|
||||
'sampler_priority',
|
||||
'custom_token_bans',
|
||||
'negative_prompt',
|
||||
'dry_sequence_breakers',
|
||||
'grammar_string',
|
||||
'grammar_file_row',
|
||||
|
|
|
@@ -1,6 +1,7 @@
|
|||
import time
|
||||
import traceback
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from modules import models, sampler_hijack, shared
|
||||
|
@@ -38,70 +39,84 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
|
|||
return 'Error: No model is loaded! Select one in the Model tab.', previous
|
||||
|
||||
is_non_hf_exllamav2 = shared.model.__class__.__name__ == 'Exllamav2Model'
|
||||
is_non_hf_llamacpp = shared.model.__class__.__name__ == 'LlamaCppModel'
|
||||
is_llamacpp = shared.model.__class__.__name__ == 'LlamaServer'
|
||||
|
||||
if use_samplers:
|
||||
if any([is_non_hf_exllamav2, is_non_hf_llamacpp]):
|
||||
logger.error("Sampler hijacking is not supported non-Huggingface loaders.")
|
||||
# sampling is all done in c for exllama, so it is really hard to hijack
|
||||
# it should be possible to hijack llamacpp sampler by hijacking all their sampling methods,
|
||||
# but it is not implemented yet
|
||||
return 'Error: Sampler hijacking is not supported with non-Huggingface loaders. Please disable the "Use samplers" option.', previous
|
||||
if is_llamacpp:
|
||||
logprobs = shared.model.get_logits(prompt, state, n_probs=top_logits, use_samplers=use_samplers)
|
||||
if return_dict:
|
||||
output = {}
|
||||
for entry in logprobs:
|
||||
token = repr(entry['token'])
|
||||
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
|
||||
output[token] = prob
|
||||
|
||||
state['max_new_tokens'] = 1
|
||||
state['auto_max_new_tokens'] = False
|
||||
for _ in generate_reply(prompt, state):
|
||||
pass
|
||||
|
||||
scores = sampler_hijack.global_scores[-1]
|
||||
else:
|
||||
if is_non_hf_exllamav2:
|
||||
device = get_device()
|
||||
tokens = shared.tokenizer.encode(prompt)
|
||||
if device:
|
||||
tokens = tokens.to(device)
|
||||
|
||||
scores = shared.model.get_logits(tokens)[-1][-1]
|
||||
elif is_non_hf_llamacpp:
|
||||
tokens = shared.tokenizer.encode(prompt)
|
||||
scores = shared.model.get_logits(tokens)[-1][-1]
|
||||
return output
|
||||
else:
|
||||
device = get_device()
|
||||
tokens = shared.tokenizer.encode(prompt, return_tensors='pt')
|
||||
if device:
|
||||
tokens = tokens.to(device)
|
||||
output = ''
|
||||
for entry in logprobs:
|
||||
token = repr(entry['token'])
|
||||
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
|
||||
output += f"{prob:.5f} - {token}\n"
|
||||
|
||||
output = shared.model(input_ids=tokens)
|
||||
scores = output['logits'][-1][-1]
|
||||
|
||||
probs = torch.softmax(scores, dim=-1, dtype=torch.float)
|
||||
topk_values, topk_indices = torch.topk(probs, k=top_logits, largest=True, sorted=True)
|
||||
if is_non_hf_llamacpp:
|
||||
topk_indices = [i.expand((1, 1)) for i in topk_indices]
|
||||
|
||||
if hasattr(shared.tokenizer, 'convert_ids_to_tokens'):
|
||||
tokens = [shared.tokenizer.convert_ids_to_tokens(int(i)) for i in topk_indices]
|
||||
return output, previous
|
||||
else:
|
||||
tokens = [shared.tokenizer.decode(i) for i in topk_indices]
|
||||
if not use_samplers:
|
||||
state = {'stream': True}
|
||||
|
||||
if return_dict:
|
||||
topk_values = [float(i) for i in topk_values]
|
||||
output = {}
|
||||
for row in list(zip(topk_values, tokens)):
|
||||
key = row[1]
|
||||
if isinstance(key, bytes):
|
||||
try:
|
||||
key = key.decode()
|
||||
except:
|
||||
key = key.decode('latin')
|
||||
if use_samplers:
|
||||
if is_non_hf_exllamav2:
|
||||
# sampling is all done in C++ for exllama, so it is really hard to hijack
|
||||
logger.error("Sampler hijacking is not supported non-Huggingface loaders.")
|
||||
return 'Error: Sampler hijacking is not supported with non-Huggingface loaders. Please disable the "Use samplers" option.', previous
|
||||
|
||||
output[key] = row[0]
|
||||
state['max_new_tokens'] = 1
|
||||
state['auto_max_new_tokens'] = False
|
||||
for _ in generate_reply(prompt, state):
|
||||
pass
|
||||
|
||||
return output
|
||||
else:
|
||||
topk_values = [f"{float(i):.5f}" for i in topk_values]
|
||||
output = ''
|
||||
for row in list(zip(topk_values, tokens)):
|
||||
output += f"{row[0]} - {repr(row[1])}\n"
|
||||
scores = sampler_hijack.global_scores[-1]
|
||||
else:
|
||||
if is_non_hf_exllamav2:
|
||||
device = get_device()
|
||||
tokens = shared.tokenizer.encode(prompt)
|
||||
if device:
|
||||
tokens = tokens.to(device)
|
||||
|
||||
return output, previous
|
||||
scores = shared.model.get_logits(tokens)[-1][-1]
|
||||
else:
|
||||
device = get_device()
|
||||
tokens = shared.tokenizer.encode(prompt, return_tensors='pt')
|
||||
if device:
|
||||
tokens = tokens.to(device)
|
||||
|
||||
output = shared.model(input_ids=tokens)
|
||||
scores = output['logits'][-1][-1]
|
||||
|
||||
probs = torch.softmax(scores, dim=-1, dtype=torch.float)
|
||||
topk_values, topk_indices = torch.topk(probs, k=top_logits, largest=True, sorted=True)
|
||||
if hasattr(shared.tokenizer, 'convert_ids_to_tokens'):
|
||||
tokens = [shared.tokenizer.convert_ids_to_tokens(int(i)) for i in topk_indices]
|
||||
else:
|
||||
tokens = [shared.tokenizer.decode(i) for i in topk_indices]
|
||||
|
||||
if return_dict:
|
||||
topk_values = [float(i) for i in topk_values]
|
||||
output = {}
|
||||
for row in list(zip(topk_values, tokens)):
|
||||
key = row[1]
|
||||
if isinstance(key, bytes):
|
||||
try:
|
||||
key = key.decode()
|
||||
except:
|
||||
key = key.decode('latin')
|
||||
|
||||
output[key] = row[0]
|
||||
|
||||
return output
|
||||
else:
|
||||
topk_values = [f"{float(i):.5f}" for i in topk_values]
|
||||
output = ''
|
||||
for row in list(zip(topk_values, tokens)):
|
||||
output += f"{row[0]} - {repr(row[1])}\n"
|
||||
|
||||
return output, previous
|
||||
|
|
|
@@ -67,8 +67,7 @@ def load_model(model_name, loader=None):
|
|||
shared.model_name = model_name
|
||||
load_func_map = {
|
||||
'Transformers': huggingface_loader,
|
||||
'llama.cpp': llamacpp_loader,
|
||||
'llamacpp_HF': llamacpp_HF_loader,
|
||||
'llama.cpp': llama_cpp_server_loader,
|
||||
'ExLlamav3_HF': ExLlamav3_HF_loader,
|
||||
'ExLlamav2_HF': ExLlamav2_HF_loader,
|
||||
'ExLlamav2': ExLlamav2_loader,
|
||||
|
@@ -101,7 +100,7 @@ def load_model(model_name, loader=None):
|
|||
shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
|
||||
if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt'):
|
||||
shared.settings['truncation_length'] = shared.args.max_seq_len
|
||||
elif loader in ['llama.cpp', 'llamacpp_HF']:
|
||||
elif loader == 'llama.cpp':
|
||||
shared.settings['truncation_length'] = shared.args.n_ctx
|
||||
|
||||
logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.")
|
||||
|
@@ -268,8 +267,8 @@ def huggingface_loader(model_name):
|
|||
return model
|
||||
|
||||
|
||||
def llamacpp_loader(model_name):
|
||||
from modules.llamacpp_model import LlamaCppModel
|
||||
def llama_cpp_server_loader(model_name):
|
||||
from modules.llama_cpp_server import LlamaServer
|
||||
|
||||
path = Path(f'{shared.args.model_dir}/{model_name}')
|
||||
if path.is_file():
|
||||
|
@@ -278,31 +277,11 @@ def llamacpp_loader(model_name):
|
|||
model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
|
||||
|
||||
logger.info(f"llama.cpp weights detected: \"{model_file}\"")
|
||||
model, tokenizer = LlamaCppModel.from_pretrained(model_file)
|
||||
return model, tokenizer
|
||||
|
||||
|
||||
def llamacpp_HF_loader(model_name):
|
||||
from modules.llamacpp_hf import LlamacppHF
|
||||
|
||||
if shared.args.tokenizer_dir:
|
||||
logger.info(f'Using tokenizer from: \"{shared.args.tokenizer_dir}\"')
|
||||
else:
|
||||
path = Path(f'{shared.args.model_dir}/{model_name}')
|
||||
# Check if a HF tokenizer is available for the model
|
||||
if all((path / file).exists() for file in ['tokenizer_config.json']):
|
||||
logger.info(f'Using tokenizer from: \"{path}\"')
|
||||
else:
|
||||
logger.error("Could not load the model because a tokenizer in Transformers format was not found.")
|
||||
return None, None
|
||||
|
||||
model = LlamacppHF.from_pretrained(model_name)
|
||||
|
||||
if shared.args.tokenizer_dir:
|
||||
tokenizer = load_tokenizer(model_name, tokenizer_dir=shared.args.tokenizer_dir)
|
||||
return model, tokenizer
|
||||
else:
|
||||
return model
|
||||
try:
|
||||
model = LlamaServer(model_file)
|
||||
return model, model
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading the model with llama.cpp: {str(e)}")
|
||||
|
||||
|
||||
def ExLlamav3_HF_loader(model_name):
|
||||
|
|
|
@@ -29,7 +29,7 @@ def get_model_metadata(model):
|
|||
# Get settings from models/config.yaml and models/config-user.yaml
|
||||
settings = shared.model_config
|
||||
for pat in settings:
|
||||
if re.match(pat.lower(), model.lower()):
|
||||
if re.match(pat.lower(), Path(model).name.lower()):
|
||||
for k in settings[pat]:
|
||||
model_settings[k] = settings[pat][k]
|
||||
|
||||
|
@@ -40,10 +40,15 @@ def get_model_metadata(model):
|
|||
hf_metadata = None
|
||||
|
||||
if 'loader' not in model_settings:
|
||||
model_settings['loader'] = infer_loader(model, model_settings)
|
||||
quant_method = None if hf_metadata is None else hf_metadata.get("quantization_config", {}).get("quant_method", None)
|
||||
model_settings['loader'] = infer_loader(
|
||||
model,
|
||||
model_settings,
|
||||
hf_quant_method=quant_method
|
||||
)
|
||||
|
||||
# GGUF metadata
|
||||
if model_settings['loader'] in ['llama.cpp', 'llamacpp_HF']:
|
||||
if model_settings['loader'] == 'llama.cpp':
|
||||
path = Path(f'{shared.args.model_dir}/{model}')
|
||||
if path.is_file():
|
||||
model_file = path
|
||||
|
@@ -143,7 +148,7 @@ def get_model_metadata(model):
|
|||
# Apply user settings from models/config-user.yaml
|
||||
settings = shared.user_config
|
||||
for pat in settings:
|
||||
if re.match(pat.lower(), model.lower()):
|
||||
if re.match(pat.lower(), Path(model).name.lower()):
|
||||
for k in settings[pat]:
|
||||
model_settings[k] = settings[pat][k]
|
||||
|
||||
|
@@ -154,16 +159,18 @@ def get_model_metadata(model):
|
|||
return model_settings
|
||||
|
||||
|
||||
def infer_loader(model_name, model_settings):
|
||||
def infer_loader(model_name, model_settings, hf_quant_method=None):
|
||||
path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
|
||||
if not path_to_model.exists():
|
||||
loader = None
|
||||
elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
|
||||
loader = 'llamacpp_HF'
|
||||
elif len(list(path_to_model.glob('*.gguf'))) > 0:
|
||||
loader = 'llama.cpp'
|
||||
elif re.match(r'.*\.gguf', model_name.lower()):
|
||||
loader = 'llama.cpp'
|
||||
elif hf_quant_method == 'exl3':
|
||||
loader = 'ExLlamav3_HF'
|
||||
elif hf_quant_method in ['exl2', 'gptq']:
|
||||
loader = 'ExLlamav2_HF'
|
||||
elif re.match(r'.*exl3', model_name.lower()):
|
||||
loader = 'ExLlamav3_HF'
|
||||
elif re.match(r'.*exl2', model_name.lower()):
|
||||
|
@@ -245,7 +252,7 @@ def save_model_settings(model, state):
|
|||
return
|
||||
|
||||
user_config = shared.load_user_config()
|
||||
model_regex = model + '$' # For exact matches
|
||||
model_regex = Path(model).name + '$' # For exact matches
|
||||
if model_regex not in user_config:
|
||||
user_config[model_regex] = {}
|
||||
|
||||
|
@@ -272,7 +279,7 @@ def save_instruction_template(model, template):
|
|||
return
|
||||
|
||||
user_config = shared.load_user_config()
|
||||
model_regex = model + '$' # For exact matches
|
||||
model_regex = Path(model).name + '$' # For exact matches
|
||||
if model_regex not in user_config:
|
||||
user_config[model_regex] = {}
|
||||
|
||||
|
|
|
@@ -86,7 +86,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
|
|||
|
||||
# Model loader
|
||||
group = parser.add_argument_group('Model loader')
|
||||
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, TensorRT-LLM.')
|
||||
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, TensorRT-LLM.')
|
||||
|
||||
# Transformers/Accelerate
|
||||
group = parser.add_argument_group('Transformers/Accelerate')
|
||||
|
@@ -116,24 +116,17 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for
|
|||
# llama.cpp
|
||||
group = parser.add_argument_group('llama.cpp')
|
||||
group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.')
|
||||
group.add_argument('--tensorcores', action='store_true', help='NVIDIA only: use llama-cpp-python compiled without GGML_CUDA_FORCE_MMQ. This may improve performance on newer cards.')
|
||||
group.add_argument('--n_ctx', type=int, default=8192, help='Size of the prompt context.')
|
||||
group.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
|
||||
group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.')
|
||||
group.add_argument('--no_mul_mat_q', action='store_true', help='Disable the mulmat kernels.')
|
||||
group.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
|
||||
group.add_argument('--batch-size', type=int, default=2048, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
|
||||
group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
|
||||
group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
|
||||
group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.')
|
||||
group.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
|
||||
group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.')
|
||||
group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
|
||||
group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
|
||||
group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
|
||||
group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (llama-cpp-python). Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed.')
|
||||
group.add_argument('--row_split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
|
||||
group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
|
||||
group.add_argument('--attention-sink-size', type=int, default=5, help='StreamingLLM: number of sink tokens. Only used if the trimmed prompt does not share a prefix with the old prompt.')
|
||||
group.add_argument('--tokenizer-dir', type=str, help='Load the tokenizer from this folder. Meant to be used with llamacpp_HF through the command-line.')
|
||||
group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
|
||||
group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
|
||||
|
||||
# ExLlamaV2
|
||||
group = parser.add_argument_group('ExLlamaV2')
|
||||
|
@@ -197,24 +190,8 @@ group.add_argument('--api-enable-ipv6', action='store_true', help='Enable IPv6 f
|
|||
group.add_argument('--api-disable-ipv4', action='store_true', help='Disable IPv4 for the API')
|
||||
group.add_argument('--nowebui', action='store_true', help='Do not launch the Gradio UI. Useful for launching the API in standalone mode.')
|
||||
|
||||
# Multimodal
|
||||
group = parser.add_argument_group('Multimodal')
|
||||
group.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
|
||||
|
||||
# Deprecated parameters
|
||||
group = parser.add_argument_group('Deprecated')
|
||||
group.add_argument('--cache_4bit', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--cache_8bit', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--chat-buttons', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--triton', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--no_inject_fused_mlp', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--no_use_cuda_fp16', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--desc_act', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--disable_exllama', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--disable_exllamav2', action='store_true', help='DEPRECATED')
|
||||
group.add_argument('--wbits', type=int, default=0, help='DEPRECATED')
|
||||
group.add_argument('--groupsize', type=int, default=-1, help='DEPRECATED')
|
||||
group.add_argument('--model-menu', action='store_true', help='DEPRECATED')
|
||||
|
||||
args = parser.parse_args()
|
||||
args_defaults = parser.parse_args([])
|
||||
|
@@ -224,28 +201,8 @@ for arg in sys.argv[1:]:
    if hasattr(args, arg):
        provided_arguments.append(arg)

deprecated_args = [
    'cache_4bit',
    'cache_8bit',
    'chat_buttons',
    'triton',
    'no_inject_fused_mlp',
    'no_use_cuda_fp16',
    'desc_act',
    'disable_exllama',
    'disable_exllamav2',
    'wbits',
    'groupsize'
]


def do_cmd_flags_warnings():

    # Deprecation warnings
    for k in deprecated_args:
        if k in provided_arguments:
            logger.warning(f'The --{k} flag has been deprecated and will be removed soon. Please remove that flag.')

    # Security warnings
    if args.trust_remote_code:
        logger.warning('trust_remote_code is enabled. This is dangerous.')

@@ -263,10 +220,8 @@ def fix_loader_name(name):
        return name

    name = name.lower()
    if name in ['llamacpp', 'llama.cpp', 'llama-cpp', 'llama cpp']:
    if name in ['llama.cpp', 'llamacpp', 'llama-cpp', 'llama cpp']:
        return 'llama.cpp'
    if name in ['llamacpp_hf', 'llama.cpp_hf', 'llama-cpp-hf', 'llamacpp-hf', 'llama.cpp-hf']:
        return 'llamacpp_HF'
    elif name in ['transformers', 'huggingface', 'hf', 'hugging_face', 'hugging face']:
        return 'Transformers'
    elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']:

@@ -281,58 +236,6 @@ def fix_loader_name(name):
        return 'TensorRT-LLM'

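For reference, a quick sketch of how these aliases normalize. It assumes `fix_loader_name` is importable from `modules.shared`, where this hunk appears; only return values shown in the diff are used:

```python
# Illustrative only: normalizing a few loader aliases with the function above.
from modules.shared import fix_loader_name

print(fix_loader_name('LLaMA.cpp'))     # -> 'llama.cpp'
print(fix_loader_name('llamacpp-hf'))   # -> 'llamacpp_HF'
print(fix_loader_name('hugging face'))  # -> 'Transformers'
```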
def transform_legacy_kv_cache_options(opts):
    # Handle both argparse.Namespace and dict here
    def get(key):
        return opts.get(key) if isinstance(opts, dict) else getattr(opts, key, None)

    def set(key, value):
        if isinstance(opts, dict):
            opts[key] = value
        else:
            setattr(opts, key, value)

    def del_key(key, fallback_set):
        # only remove from user dict, can't delete from argparse.Namespace
        if type(opts) is dict:
            if key in opts:
                del opts[key]
        else:
            setattr(opts, key, fallback_set)

    # Retrieve values
    loader = get('loader')
    cache_8bit = get('cache_8bit')
    cache_4bit = get('cache_4bit')

    # Determine cache type based on loader or legacy flags
    if cache_8bit or cache_4bit:
        if not loader:
            # Legacy behavior: prefer 8-bit over 4-bit to minimize breakage
            if cache_8bit:
                set('cache_type', 'fp8')
            elif cache_4bit:
                set('cache_type', 'q4')
        elif loader.lower() in ['exllamav2', 'exllamav2_hf']:
            # ExLlamaV2 loader-specific cache type
            if cache_8bit:
                set('cache_type', 'fp8')
            elif cache_4bit:
                set('cache_type', 'q4')
        elif loader.lower() in ['llama.cpp', 'llamacpp_hf']:
            # Llama.cpp loader-specific cache type
            if cache_4bit:
                set('cache_type', 'q4_0')
            elif cache_8bit:
                set('cache_type', 'q8_0')

    # Clean up legacy keys
    del_key('cache_4bit', False)
    del_key('cache_8bit', False)

    return opts

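A minimal usage sketch of the helper above. The values are hypothetical, and it assumes the function is importable from `modules.shared`; the resulting `cache_type` values follow the branches shown in the diff:

```python
# Hypothetical examples of the legacy-flag translation defined above.
from modules.shared import transform_legacy_kv_cache_options

legacy = {'loader': 'llama.cpp', 'cache_8bit': True, 'cache_4bit': False}
print(transform_legacy_kv_cache_options(legacy))
# {'loader': 'llama.cpp', 'cache_type': 'q8_0'}

legacy = {'loader': 'ExLlamav2_HF', 'cache_8bit': False, 'cache_4bit': True}
print(transform_legacy_kv_cache_options(legacy))
# {'loader': 'ExLlamav2_HF', 'cache_type': 'q4'}
```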
def add_extension(name, last=False):
    if args.extensions is None:
        args.extensions = [name]

@@ -361,18 +264,10 @@ def load_user_config():
    else:
        user_config = {}

    for model_name in user_config:
        user_config[model_name] = transform_legacy_kv_cache_options(user_config[model_name])

    return user_config


args.loader = fix_loader_name(args.loader)
args = transform_legacy_kv_cache_options(args)

# Activate the multimodal extension
if args.multimodal_pipeline is not None:
    add_extension('multimodal')

# Activate the API extension
if args.api or args.public_api:

@@ -17,7 +17,6 @@ from transformers import (
import modules.shared as shared
from modules import models, sampler_hijack
from modules.cache_utils import process_llamacpp_cache
from modules.callbacks import (
    Iteratorize,
    Stream,

@@ -56,7 +55,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
        yield ''
        return

    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']:
    if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel']:
        generate_func = generate_reply_custom
    else:
        generate_func = generate_reply_HF

@@ -133,8 +132,12 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
    if shared.tokenizer is None:
        raise ValueError('No tokenizer is loaded')

    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']:
        input_ids = shared.tokenizer.encode(str(prompt))
    if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel']:
        if shared.model.__class__.__name__ == 'LlamaServer':
            input_ids = shared.tokenizer.encode(str(prompt), add_bos_token=add_bos_token)
        else:
            input_ids = shared.tokenizer.encode(str(prompt))

        if shared.model.__class__.__name__ not in ['Exllamav2Model']:
            input_ids = np.array(input_ids).reshape(1, len(input_ids))
    else:

@@ -159,7 +162,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
    if truncation_length is not None:
        input_ids = input_ids[:, -truncation_length:]

    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
    if shared.model.__class__.__name__ in ['LlamaServer', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu:
        return input_ids
    else:
        device = get_device()

@@ -186,7 +189,7 @@ def get_encoded_length(prompt):

def get_token_ids(prompt):
    tokens = encode(prompt)[0]
    decoded_tokens = [shared.tokenizer.decode([i]) for i in tokens]
    decoded_tokens = [shared.tokenizer.decode([int(i)]) for i in tokens]

    output = ''
    for row in list(zip(tokens, decoded_tokens)):

@@ -401,12 +404,6 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
        logger.info("PROMPT=")
        print_prompt(decode(input_ids[0], skip_special_tokens=False))

    # Handle StreamingLLM for llamacpp_HF
    if shared.model.__class__.__name__ == 'LlamacppHF' and shared.args.streaming_llm:
        tmp = process_llamacpp_cache(shared.model.model, input_ids[-1].tolist(), shared.model.model._input_ids.tolist())
        shared.model.past_seq = torch.tensor(tmp)
        shared.model.save_cache()

    t0 = time.time()
    try:
        if not is_chat and not shared.is_seq2seq:

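The reshape in `encode()` above only turns a flat token list into a batch of one; a tiny self-contained illustration with made-up token ids:

```python
import numpy as np

# Made-up token ids standing in for the output of a tokenizer's encode().
input_ids = [1, 15043, 3186]
batched = np.array(input_ids).reshape(1, len(input_ids))
print(batched.shape)  # (1, 3)
```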
@@ -110,7 +110,7 @@ def list_model_elements():
        'n_gpu_layers',
        'threads',
        'threads_batch',
        'n_batch',
        'batch_size',
        'hqq_backend',
        'n_ctx',
        'max_seq_len',

@@ -122,21 +122,17 @@ def list_model_elements():
        'compress_pos_emb',
        'compute_dtype',
        'quant_type',
        'attention_sink_size',
        'num_experts_per_token',
        'tensorcores',
        'load_in_8bit',
        'load_in_4bit',
        'torch_compile',
        'flash_attn',
        'use_flash_attention_2',
        'streaming_llm',
        'auto_devices',
        'cpu',
        'disk',
        'row_split',
        'no_offload_kqv',
        'no_mul_mat_q',
        'no_kv_offload',
        'no_mmap',
        'mlock',
        'numa',

@@ -150,7 +146,6 @@ def list_model_elements():
        'no_sdpa',
        'cfg_cache',
        'cpp_runner',
        'logits_all',
        'trust_remote_code',
        'no_use_fast',
    ]

@@ -46,8 +46,8 @@ def create_ui():
    with gr.Row():
        with gr.Column(elem_id='chat-col'):
            shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', ''), visible=True)
            shared.gradio['display'] = gr.Textbox(value="", visible=False) # Hidden buffer
            shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True)
            shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer
    with gr.Row(elem_id="chat-input-row"):
        with gr.Column(scale=1, elem_id='gr-hover-container'):
            gr.HTML(value='<div class="hover-element" onclick="void(0)"><span style="width: 100px; display: block" id="hover-element-button">☰</span><div class="hover-menu" id="hover-menu"></div>', elem_id='gr-hover')

@@ -181,7 +181,7 @@ def create_event_handlers():
    shared.reload_inputs = gradio(reload_arr)

    # Morph HTML updates instead of updating everything
    shared.gradio['display'].change(None, gradio('display'), None, js="(text) => handleMorphdomUpdate(text)")
    shared.gradio['display'].change(None, gradio('display'), None, js="(data) => handleMorphdomUpdate(data.html)")

    shared.gradio['Generate'].click(
        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(

@@ -87,7 +87,7 @@ def create_ui():
    shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
    shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads)
    shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
    shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch)
    shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
    shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
    shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768.')
    shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768.')

@@ -99,23 +99,19 @@ def create_ui():
    shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.')
    shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.')
    shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.')
    shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.')
    shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.')

    with gr.Column():
        shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled without GGML_CUDA_FORCE_MMQ. This may improve performance on newer cards.')
        shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
        shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
        shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.')
        shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
        shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
        shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
        shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
        shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
        shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk)
        shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.')
        shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
        shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
        shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
        shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap)
        shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock)
        shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.')

@@ -129,10 +125,8 @@ def create_ui():
    shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa)
    shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')
    shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.')
    shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work with this loader. Otherwise, ignore it, as it makes prompt processing slower.')
    shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code)
    shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.')
    shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.")
    shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.")
    shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.')

@@ -147,15 +141,6 @@ def create_ui():
    shared.gradio['download_model_button'] = gr.Button("Download", variant='primary', interactive=not mu)
    shared.gradio['get_file_list'] = gr.Button("Get file list", interactive=not mu)

    with gr.Tab("llamacpp_HF creator"):
        with gr.Row():
            shared.gradio['gguf_menu'] = gr.Dropdown(choices=utils.get_available_ggufs(), value=lambda: shared.model_name, label='Choose your GGUF', elem_classes='slim-dropdown', interactive=not mu)
            ui.create_refresh_button(shared.gradio['gguf_menu'], lambda: None, lambda: {'choices': utils.get_available_ggufs()}, 'refresh-button', interactive=not mu)

        shared.gradio['unquantized_url'] = gr.Textbox(label="Enter the URL for the original (unquantized) model", info="Example: https://huggingface.co/lmsys/vicuna-13b-v1.5", max_lines=1)
        shared.gradio['create_llamacpp_hf_button'] = gr.Button("Submit", variant="primary", interactive=not mu)
        gr.Markdown("This will move your gguf file into a subfolder of `models` along with the necessary tokenizer files.")

    with gr.Tab("Customize instruction template"):
        with gr.Row():
            shared.gradio['customized_template'] = gr.Dropdown(choices=utils.get_available_instruction_templates(), value='None', label='Select the desired instruction template', elem_classes='slim-dropdown')

@@ -195,7 +180,6 @@ def create_event_handlers():
    shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
    shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True)
    shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), gradio('load_model'))
    shared.gradio['create_llamacpp_hf_button'].click(create_llamacpp_hf, gradio('gguf_menu', 'unquantized_url'), gradio('model_status'), show_progress=True)
    shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True)

@@ -286,34 +270,11 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
        yield traceback.format_exc().replace('\n', '\n\n')


def create_llamacpp_hf(gguf_name, unquantized_url, progress=gr.Progress()):
    try:
        downloader = importlib.import_module("download-model").ModelDownloader()

        progress(0.0)
        model, branch = downloader.sanitize_model_and_branch_names(unquantized_url, None)

        yield ("Getting the tokenizer files links from Hugging Face")
        links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=True)
        output_folder = Path(shared.args.model_dir) / (re.sub(r'(?i)\.gguf$', '', gguf_name) + "-HF")

        yield (f"Downloading tokenizer to `{output_folder}/`")
        downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=False)

        # Move the GGUF
        (Path(shared.args.model_dir) / gguf_name).rename(output_folder / gguf_name)

        yield (f"Model saved to `{output_folder}/`.\n\nYou can now load it using llamacpp_HF.")
    except:
        progress(1.0)
        yield traceback.format_exc().replace('\n', '\n\n')


def update_truncation_length(current_length, state):
    if 'loader' in state:
        if state['loader'].lower().startswith('exllama'):
            return state['max_seq_len']
        elif state['loader'] in ['llama.cpp', 'llamacpp_HF']:
        elif state['loader'] == 'llama.cpp':
            return state['n_ctx']

    return current_length

@@ -69,10 +69,8 @@ def set_interface_arguments(extensions, bool_active):


def get_boolean_arguments(active=False):
    exclude = shared.deprecated_args

    cmd_list = vars(shared.args)
    bool_list = sorted([k for k in cmd_list if type(cmd_list[k]) is bool and k not in exclude + ui.list_model_elements()])
    bool_list = sorted([k for k in cmd_list if type(cmd_list[k]) is bool and k not in ui.list_model_elements()])
    bool_active = [k for k in bool_list if vars(shared.args)[k]]

    if active:

@@ -73,21 +73,61 @@ def natural_keys(text):


def get_available_models():
    model_list = []
    for item in list(Path(f'{shared.args.model_dir}/').glob('*')):
        if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml', '.py')) and 'llama-tokenizer' not in item.name:
            model_list.append(item.name)
    # Get all GGUF files
    gguf_files = get_available_ggufs()

    return ['None'] + sorted(model_list, key=natural_keys)
    model_dir = Path(shared.args.model_dir)

    # Find top-level directories containing GGUF files
    dirs_with_gguf = set()
    for gguf_path in gguf_files:
        path = Path(gguf_path)
        if path.parts: # If in a subdirectory
            dirs_with_gguf.add(path.parts[0]) # Add top-level directory

    # Find directories with safetensors files directly under them
    dirs_with_safetensors = set()
    for item in os.listdir(model_dir):
        item_path = model_dir / item
        if item_path.is_dir():
            # Check if there are safetensors files directly under this directory
            if any(file.lower().endswith(('.safetensors', '.pt')) for file in os.listdir(item_path) if (item_path / file).is_file()):
                dirs_with_safetensors.add(item)

    # Find valid model directories
    model_dirs = []

    for item in os.listdir(model_dir):
        item_path = model_dir / item

        # Skip if not a directory
        if not item_path.is_dir():
            continue

        # Include directory if it either:
        # 1. Doesn't contain GGUF files, OR
        # 2. Contains both GGUF and safetensors files
        if item not in dirs_with_gguf or item in dirs_with_safetensors:
            model_dirs.append(item)

    model_dirs = sorted(model_dirs, key=natural_keys)

    # Combine all models
    return ['None'] + gguf_files + model_dirs


def get_available_ggufs():
    model_list = []
    for item in Path(f'{shared.args.model_dir}/').glob('*'):
        if item.is_file() and item.name.lower().endswith(".gguf"):
            model_list.append(item.name)
    model_dir = Path(shared.args.model_dir)

    return ['None'] + sorted(model_list, key=natural_keys)
    for dirpath, _, files in os.walk(model_dir, followlinks=True):
        for file in files:
            if file.lower().endswith(".gguf"):
                model_path = Path(dirpath) / file
                rel_path = model_path.relative_to(model_dir)
                model_list.append(str(rel_path))

    return sorted(model_list, key=natural_keys)

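To illustrate the new recursive GGUF discovery, here is a self-contained sketch of the same `os.walk()` logic run against a throwaway directory (the file names are made up, and the output shown assumes a Unix path separator):

```python
# Hypothetical layout: one GGUF at the top level, one inside a subfolder.
import os
import tempfile
from pathlib import Path

model_dir = Path(tempfile.mkdtemp())
(model_dir / 'subfolder').mkdir()
(model_dir / 'model-a.Q4_K_M.gguf').touch()
(model_dir / 'subfolder' / 'model-b.Q5_K_M.gguf').touch()

found = []
for dirpath, _, files in os.walk(model_dir, followlinks=True):
    for file in files:
        if file.lower().endswith('.gguf'):
            found.append(str((Path(dirpath) / file).relative_to(model_dir)))

print(sorted(found))  # ['model-a.Q4_K_M.gguf', 'subfolder/model-b.Q5_K_M.gguf']
```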

def get_available_presets():

@@ -26,6 +26,7 @@ LIBSTDCXX_VERSION_LINUX = "12.1.0"
# Environment
script_dir = os.getcwd()
conda_env_path = os.path.join(script_dir, "installer_files", "env")
state_file = '.installer_state.json'

# Command-line flags
cmd_flags_path = os.path.join(script_dir, "CMD_FLAGS.txt")

@@ -238,6 +239,9 @@ def get_user_choice(question, options_dict):


def install_webui():
    if os.path.isfile(state_file):
        os.remove(state_file)

    # Ask the user for the GPU vendor
    if "GPU_CHOICE" in os.environ:
        choice = os.environ["GPU_CHOICE"].upper()

@@ -372,7 +376,6 @@ def update_requirements(initial_installation=False, pull=True):
    requirements_file = "requirements" + ("_noavx2" if not cpu_has_avx2() else "") + ".txt"

    # Load state from JSON file
    state_file = '.installer_state.json'
    current_commit = get_current_commit()
    wheels_changed = False
    if os.path.exists(state_file):

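`update_requirements()` above checks a small JSON state file next to the installer to decide whether wheels need to be reinstalled. A minimal sketch of that kind of round trip, purely illustrative; the key name used here is an assumption, not taken from the diff:

```python
# Hedged sketch of reading/writing an installer state file like the one above.
import json
import os

state_file = '.installer_state.json'

def save_state(commit_hash):
    # Key name 'last_installed_commit' is hypothetical.
    with open(state_file, 'w') as f:
        json.dump({'last_installed_commit': commit_hash}, f)

def load_state():
    if os.path.exists(state_file):
        with open(state_file) as f:
            return json.load(f)
    return {}
```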
@@ -31,21 +31,11 @@ flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# llama-cpp-python (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

# llama-cpp-python (CUDA, with GGML_CUDA_FORCE_MMQ)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

# llama-cpp-python (CUDA, without GGML_CUDA_FORCE_MMQ)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

# CUDA wheels
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1/exllamav3-0.0.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1/exllamav3-0.0.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

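Each wheel line above is gated by a PEP 508 environment marker, so `pip install -r requirements.txt` only pulls the build that matches the running platform and Python version. A small sketch of how such a marker evaluates, using the `packaging` library:

```python
# Evaluate one of the markers above for the current interpreter/platform.
from packaging.markers import Marker

marker = Marker('platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"')
print(marker.evaluate())  # True only on 64-bit Linux running Python 3.11
```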
@@ -30,11 +30,7 @@ flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# llama-cpp-python (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

# AMD wheels
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.8+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

@@ -30,10 +30,7 @@ flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# llama-cpp-python (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/rocm/llama_cpp_binaries-0.2.0+rocm6.1.2avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

@@ -31,7 +31,7 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1/exllamav3-0.0.1-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl

@@ -31,8 +31,8 @@ sse-starlette==1.6.5
tiktoken

# Mac wheels
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1/exllamav3-0.0.1-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/metal/llama_cpp_binaries-0.2.0-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl

@@ -30,6 +30,6 @@ flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# llama-cpp-python (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

@@ -30,6 +30,6 @@ flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# llama-cpp-python (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/cpu/llama_cpp_binaries-0.2.0+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

@@ -31,21 +31,11 @@ flask_cloudflared==0.0.14
sse-starlette==1.6.5
tiktoken

# llama-cpp-python (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

# llama-cpp-python (CUDA, with GGML_CUDA_FORCE_MMQ)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

# llama-cpp-python (CUDA, without GGML_CUDA_FORCE_MMQ)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

# CUDA wheels
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1/exllamav3-0.0.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1/exllamav3-0.0.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/textgen-webui/llama_cpp_binaries-0.2.0+cu124avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a3/exllamav3-0.0.1a3+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"