Merge pull request #6939 from oobabooga/dev

Merge dev branch
oobabooga authored 2025-05-01 00:15:11 -03:00, committed by GitHub
commit a41da1ec95
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
35 changed files with 210 additions and 111 deletions

View file

@@ -36,7 +36,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases
#### Option 2: One-click installer
1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip).
2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`.
2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, or `start_macos.sh`.
3) Select your GPU vendor when asked.
4) After installation completes, browse to `http://localhost:7860`.
5) Have fun!

View file

@@ -2,6 +2,7 @@ import asyncio
import json
import logging
import os
import socket
import traceback
from collections import deque
from threading import Thread
@@ -374,9 +375,26 @@ async def handle_unload_loras():
return JSONResponse(content="OK")
def find_available_port(starting_port):
"""Try the starting port, then find an available one if it's taken."""
try:
# Try to create a socket with the starting port
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', starting_port))
return starting_port
except OSError:
# Port is already in use, so find a new one
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0)) # Bind to port 0 to get an available port
new_port = s.getsockname()[1]
logger.warning(f"Port {starting_port} is already in use. Using port {new_port} instead.")
return new_port
def run_server():
# Parse configuration
port = int(os.environ.get('OPENEDAI_PORT', shared.args.api_port))
port = find_available_port(port)
ssl_certfile = os.environ.get('OPENEDAI_CERT_PATH', shared.args.ssl_certfile)
ssl_keyfile = os.environ.get('OPENEDAI_KEY_PATH', shared.args.ssl_keyfile)

View file

@@ -42,6 +42,7 @@ class GenerationOptions(BaseModel):
auto_max_new_tokens: bool = False
ban_eos_token: bool = False
add_bos_token: bool = True
enable_thinking: bool = True
skip_special_tokens: bool = True
static_cache: bool = False
truncation_length: int = 0
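Because GenerationOptions backs the OpenAI-compatible API schema, the new field can be sent directly in a request body. A hedged sketch against a default local install (host, port, and endpoint path assume an unmodified --api setup):

import requests

response = requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 200,
        # New field: ask thinking-capable models (e.g. Qwen3) to skip <think> blocks
        "enable_thinking": False,
    },
)
print(response.json()["choices"][0]["message"]["content"])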

View file

@@ -31,13 +31,13 @@ function removeLastClick() {
}
function handleMorphdomUpdate(text) {
// Track closed blocks
const closedBlocks = new Set();
// Track open blocks
const openBlocks = new Set();
document.querySelectorAll(".thinking-block").forEach(block => {
const blockId = block.getAttribute("data-block-id");
// If block exists and is not open, add to closed set
if (blockId && !block.hasAttribute("open")) {
closedBlocks.add(blockId);
// If block exists and is open, add to open set
if (blockId && block.hasAttribute("open")) {
openBlocks.add(blockId);
}
});
@@ -72,13 +72,15 @@ function handleMorphdomUpdate(text) {
}
}
// For thinking blocks, respect closed state
// For thinking blocks, assume closed by default
if (fromEl.classList && fromEl.classList.contains("thinking-block") &&
toEl.classList && toEl.classList.contains("thinking-block")) {
const blockId = toEl.getAttribute("data-block-id");
// If this block was closed by user, keep it closed
if (blockId && closedBlocks.has(blockId)) {
// Remove open attribute by default
toEl.removeAttribute("open");
// If this block was explicitly opened by user, keep it open
if (blockId && openBlocks.has(blockId)) {
toEl.setAttribute("open", "");
}
}

View file

@@ -90,6 +90,44 @@ def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True):
return prefix, suffix
def get_thinking_suppression_string(template):
"""
Determines what string needs to be added to suppress thinking mode
by comparing template renderings with thinking enabled vs disabled.
"""
# Render with thinking enabled
with_thinking = template.render(
messages=[{'role': 'user', 'content': ''}],
builtin_tools=None,
tools=None,
tools_in_user_message=False,
add_generation_prompt=True,
enable_thinking=True
)
# Render with thinking disabled
without_thinking = template.render(
messages=[{'role': 'user', 'content': ''}],
builtin_tools=None,
tools=None,
tools_in_user_message=False,
add_generation_prompt=True,
enable_thinking=False
)
# Find the difference (what gets added to suppress thinking)
i = 0
while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]:
i += 1
j = 0
while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]:
j += 1
return without_thinking[i:len(without_thinking) - j if j else None]
def generate_chat_prompt(user_input, state, **kwargs):
impersonate = kwargs.get('impersonate', False)
_continue = kwargs.get('_continue', False)
@@ -147,13 +185,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
if user_input and not impersonate and not _continue:
messages.append({"role": "user", "content": user_input})
def remove_extra_bos(prompt):
for bos_token in ['<s>', '<|startoftext|>', '<BOS_TOKEN>', '<|endoftext|>']:
while prompt.startswith(bos_token):
prompt = prompt[len(bos_token):]
return prompt
def make_prompt(messages):
if state['mode'] == 'chat-instruct' and _continue:
prompt = renderer(messages=messages[:-1])
@@ -165,7 +196,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
if state['custom_system_message'].strip() != '':
outer_messages.append({"role": "system", "content": state['custom_system_message']})
prompt = remove_extra_bos(prompt)
command = state['chat-instruct_command']
command = command.replace('<|character|>', state['name2'] if not impersonate else state['name1'])
command = command.replace('<|prompt|>', prompt)
@@ -182,11 +212,10 @@ def generate_chat_prompt(user_input, state, **kwargs):
outer_messages.append({"role": "user", "content": command})
outer_messages.append({"role": "assistant", "content": prefix})
prompt = instruction_template.render(messages=outer_messages)
prompt = instruct_renderer(messages=outer_messages)
suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1]
if len(suffix) > 0:
prompt = prompt[:-len(suffix)]
else:
if _continue:
suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
@@ -199,7 +228,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
prompt += prefix
prompt = remove_extra_bos(prompt)
if state['mode'] == 'instruct' and not any((_continue, impersonate, state['enable_thinking'])):
prompt += get_thinking_suppression_string(instruction_template)
return prompt
prompt = make_prompt(messages)
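get_thinking_suppression_string() infers the suppression string purely from the template: it renders an empty conversation twice, strips the longest common prefix and suffix, and keeps whatever the enable_thinking=False rendering adds. A toy illustration of the diff step with invented strings that mimic the Qwen3 pattern (no real template involved):

# Hypothetical renderings for enable_thinking=True / False
with_thinking = "<|im_start|>user\n<|im_end|>\n<|im_start|>assistant\n"
without_thinking = with_thinking + "<think>\n\n</think>\n"

i = 0  # longest common prefix
while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]:
    i += 1
j = 0  # longest common suffix of what remains
while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]:
    j += 1
print(repr(without_thinking[i:len(without_thinking) - j if j else None]))
# -> '<think>\n\n</think>\n', the string generate_chat_prompt() appends to pre-empt thinking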

View file

@@ -65,7 +65,7 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin):
elif kv_cache_type == 'q4':
cache_type = ExLlamaV2Cache_Q4
else:
raise ValueError(f"Invalid cache type for ExLlamaV2: {cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")
raise ValueError(f"Invalid cache type for ExLlamaV2: {kv_cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")
# Use TP if specified
if shared.args.enable_tp:
@@ -78,12 +78,10 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin):
self.past_seq = None
if shared.args.cfg_cache:
if shared.args.cache_8bit:
self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model)
elif shared.args.cache_4bit:
self.ex_cache_negative = ExLlamaV2Cache_Q4(self.ex_model)
if shared.args.enable_tp:
self.ex_cache_negative = ExLlamaV2Cache_TP(self.ex_model, base=cache_type)
else:
self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
self.ex_cache_negative = cache_type(self.ex_model, lazy=shared.args.autosplit)
self.past_seq_negative = None

View file

@@ -118,6 +118,9 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
seq_tensor = torch.tensor(seq)
reset = True
# Maximum number of tokens to process in a single forward pass
max_chunk_size = 2048
# Make the forward call
if labels is None:
if past_seq is not None:
@@ -131,55 +134,85 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
if longest_prefix > 0:
reset = False
current_len = longest_prefix
if len(seq_tensor) - longest_prefix > 1:
remaining_tokens = len(seq_tensor) - longest_prefix - 1
if remaining_tokens > 0:
# Process tokens from longest_prefix to second-to-last token
tokens_to_process = seq_tensor[longest_prefix:-1]
# Process in chunks if the number of tokens is large
for i in range(0, tokens_to_process.shape[0], max_chunk_size):
chunk = tokens_to_process[i:i + max_chunk_size]
self.ex_model.forward(
input_ids=seq_tensor[longest_prefix:-1].view(1, -1),
input_ids=chunk.view(1, -1),
params={
"attn_mode": "flash_attn",
"cache": ex_cache,
"past_len": longest_prefix,
"batch_shape": (1, self.max_tokens)
"past_len": longest_prefix + i,
"batch_shape": (1, self.max_tokens),
"reconstruct": False # Force memory-efficient path
}
)
current_len = longest_prefix + len(seq_tensor) - longest_prefix - 1
current_len = longest_prefix + remaining_tokens
if reset:
if len(seq_tensor) > 1:
# Process all tokens except the last one
tokens_to_process = seq_tensor[:-1]
# Process in chunks if the number of tokens is large
current_len = 0
for i in range(0, tokens_to_process.shape[0], max_chunk_size):
chunk = tokens_to_process[i:i + max_chunk_size]
self.ex_model.forward(
input_ids=seq_tensor[:-1].view(1, -1),
input_ids=chunk.view(1, -1),
params={
"attn_mode": "flash_attn",
"cache": ex_cache,
"past_len": 0,
"batch_shape": (1, self.max_tokens)
"past_len": current_len,
"batch_shape": (1, self.max_tokens),
"reconstruct": False # Force memory-efficient path
}
)
current_len = len(seq_tensor) - 1
current_len += chunk.shape[0]
else:
current_len = 0
# Process the last token and get logits
logits = self.ex_model.forward(
input_ids=seq_tensor[-1:].view(1, -1),
params={
"attn_mode": "flash_attn",
"cache": ex_cache,
"past_len": current_len,
"batch_shape": (1, self.max_tokens)
"batch_shape": (1, self.max_tokens),
"reconstruct": False # Force memory-efficient path
}
).to(input_ids.device).float()
else:
logits = self.ex_model.forward(
input_ids=seq_tensor.view(1, -1),
# When processing with labels, handle as a complete sequence
# Process in chunks if the number of tokens is large
tokens_to_process = seq_tensor
all_logits = None
for i in range(0, tokens_to_process.shape[0], max_chunk_size):
chunk = tokens_to_process[i:i + max_chunk_size]
chunk_logits = self.ex_model.forward(
input_ids=chunk.view(1, -1),
params={
"attn_mode": "flash_attn",
"cache": ex_cache,
"past_len": 0,
"batch_shape": (1, self.max_tokens)
"attn_mode": "flash_attn_nc", # No caching for training
"reconstruct": False # Force memory-efficient path
}
).float()
if all_logits is None:
all_logits = chunk_logits
else:
all_logits = torch.cat([all_logits, chunk_logits], dim=1)
logits = all_logits
if is_negative:
self.past_seq_negative = seq_tensor
else:
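The reworked forward pass applies one pattern in all three branches: split the prompt into max_chunk_size pieces and advance the cache offset by the chunk length on each call. A stripped-down sketch of that pattern with a stand-in forward function (model_forward is a hypothetical placeholder, not the ExLlamaV3 API):

import torch

def chunked_prefill(model_forward, tokens, past_len=0, max_chunk_size=2048):
    """Feed a long token sequence through the model in bounded chunks."""
    for i in range(0, tokens.shape[0], max_chunk_size):
        chunk = tokens[i:i + max_chunk_size]
        # Each call sees the cache as it stood after the previous chunk
        model_forward(input_ids=chunk.view(1, -1), past_len=past_len + i)
    return past_len + tokens.shape[0]  # new cache length

Bounding the chunk size keeps the activation memory of a single forward call flat regardless of prompt length, which complements the "reconstruct": False memory-efficient path.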

View file

@@ -167,7 +167,7 @@ def convert_to_markdown(string, message_id=None):
title_text = "Thinking..." if is_streaming else "Thought"
thinking_block = f'''
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}" open>
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
<summary class="thinking-header">
<svg class="thinking-icon" width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M8 1.33334C4.31868 1.33334 1.33334 4.31868 1.33334 8.00001C1.33334 11.6813 4.31868 14.6667 8 14.6667C11.6813 14.6667 14.6667 11.6813 14.6667 8.00001C14.6667 4.31868 11.6813 1.33334 8 1.33334Z" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>

View file

@@ -135,6 +135,7 @@ class LlamaServer:
"prompt": token_ids,
"n_predict": max_new_tokens,
"stream": True,
"cache_prompt": True
})
if shared.args.verbose:
@@ -327,6 +328,11 @@ class LlamaServer:
else:
env['LD_LIBRARY_PATH'] = os.path.dirname(self.server_path)
if shared.args.verbose:
logger.info("llama-server command-line flags:")
print(' '.join(str(item) for item in cmd[1:]))
print()
# Start the server with pipes for output
self.process = subprocess.Popen(
cmd,
@@ -340,9 +346,7 @@ class LlamaServer:
# Wait for server to be healthy
health_url = f"http://127.0.0.1:{self.port}/health"
start_time = time.time()
timeout = 3600 * 8 # 8 hours
while time.time() - start_time < timeout:
while True:
# Check if process is still alive
if self.process.poll() is not None:
# Process has terminated
@@ -357,8 +361,6 @@ class LlamaServer:
pass
time.sleep(1)
else:
raise TimeoutError(f"Server health check timed out after {timeout} seconds")
# Server is now healthy, get model info
self._get_vocabulary_size()
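The startup wait is now unbounded: the loop exits only when the server answers on /health or the child process dies, rather than aborting after a fixed eight-hour timeout (useful when large models take an unpredictable time to load). A standalone sketch of the same polling pattern (URL and exception handling are illustrative):

import time
import requests

def wait_for_health(process, health_url):
    """Block until the server reports healthy or its process exits."""
    while True:
        if process.poll() is not None:  # server terminated during startup
            raise RuntimeError("Server process exited before becoming healthy.")
        try:
            if requests.get(health_url, timeout=1).status_code == 200:
                return  # healthy
        except requests.exceptions.RequestException:
            pass  # not accepting connections yet
        time.sleep(1)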

View file

@@ -143,6 +143,7 @@ def transformers_samplers():
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'skip_special_tokens',
'static_cache',
'seed',
@@ -195,6 +196,7 @@ loaders_samplers = {
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'skip_special_tokens',
'seed',
'sampler_priority',
@@ -241,6 +243,7 @@ loaders_samplers = {
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'skip_special_tokens',
'seed',
'sampler_priority',
@@ -279,6 +282,7 @@ loaders_samplers = {
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'skip_special_tokens',
'seed',
'custom_token_bans',
@@ -311,6 +315,7 @@ loaders_samplers = {
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'seed',
'sampler_priority',
'dry_sequence_breakers',

View file

@@ -45,6 +45,9 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
output = {}
for entry in logprobs:
token = repr(entry['token'])
if len(token) > 2 and token.startswith("'") and token.endswith("'"):
token = token[1:-1]
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
output[token] = prob
return output
@@ -52,6 +55,9 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
output = ''
for entry in logprobs:
token = repr(entry['token'])
if len(token) > 2 and token.startswith("'") and token.endswith("'"):
token = token[1:-1]
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
output += f"{prob:.5f} - {token}\n"
return output, previous
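In both branches, repr() keeps whitespace tokens visible (a newline token shows up as \n rather than breaking the layout), and the new guard strips the quotes that repr() wraps around ordinary tokens. A small illustration of the effect:

for raw in ["hello", " world", "\n"]:
    token = repr(raw)  # "'hello'", "' world'", "'\\n'"
    if len(token) > 2 and token.startswith("'") and token.endswith("'"):
        token = token[1:-1]  # drop the surrounding quotes, keep escape sequences visible
    print(token)
# hello
#  world
# \n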

View file

@@ -47,10 +47,11 @@ settings = {
'max_new_tokens_max': 4096,
'prompt_lookup_num_tokens': 0,
'max_tokens_second': 0,
'max_updates_second': 0,
'max_updates_second': 12,
'auto_max_new_tokens': True,
'ban_eos_token': False,
'add_bos_token': True,
'enable_thinking': True,
'skip_special_tokens': True,
'stream': True,
'static_cache': False,

View file

@@ -100,8 +100,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escape_html=False):
last_update = cur_time
yield reply
yield reply
if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything):
break

View file

@@ -114,6 +114,7 @@ def list_model_elements():
'cache_type',
'tensor_split',
'extra_flags',
'streaming_llm',
'gpu_split',
'alpha_value',
'rope_freq_base',
@@ -198,6 +199,7 @@ def list_interface_input_elements():
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'skip_special_tokens',
'stream',
'static_cache',

View file

@@ -51,7 +51,7 @@ def create_ui():
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['ctx_size'] = gr.Number(label='ctx_size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
@@ -96,7 +96,7 @@ def create_ui():
# Speculative decoding
with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
with gr.Row():
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', interactive=not mu)
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.')

View file

@@ -82,6 +82,7 @@ def create_ui(default_preset):
shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')

View file

@@ -30,10 +30,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

View file

@@ -29,6 +29,6 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@@ -29,6 +29,6 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@@ -29,7 +29,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@@ -29,8 +29,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@@ -30,10 +30,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,4 +15,4 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,4 +15,4 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@@ -15,6 +15,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -18,10 +18,11 @@ max_new_tokens_min: 1
max_new_tokens_max: 4096
prompt_lookup_num_tokens: 0
max_tokens_second: 0
max_updates_second: 0
max_updates_second: 12
auto_max_new_tokens: true
ban_eos_token: false
add_bos_token: true
enable_thinking: true
skip_special_tokens: true
stream: true
static_cache: false