From ee0592473c16db1917c98a24ec24dce36fd00bed Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 27 Apr 2025 21:04:02 -0700 Subject: [PATCH 01/22] Fix ExLlamaV3_HF leaking memory (attempt) --- modules/exllamav3_hf.py | 95 +++++++++++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 31 deletions(-) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index f15fc0b2..12b22f64 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -118,6 +118,9 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): seq_tensor = torch.tensor(seq) reset = True + # Maximum number of tokens to process in a single forward pass + max_chunk_size = 2048 + # Make the forward call if labels is None: if past_seq is not None: @@ -131,54 +134,84 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): if longest_prefix > 0: reset = False current_len = longest_prefix - if len(seq_tensor) - longest_prefix > 1: - self.ex_model.forward( - input_ids=seq_tensor[longest_prefix:-1].view(1, -1), - params={ - "attn_mode": "flash_attn", - "cache": ex_cache, - "past_len": longest_prefix, - "batch_shape": (1, self.max_tokens) - } - ) + remaining_tokens = len(seq_tensor) - longest_prefix - 1 - current_len = longest_prefix + len(seq_tensor) - longest_prefix - 1 + if remaining_tokens > 0: + # Process tokens from longest_prefix to second-to-last token + tokens_to_process = seq_tensor[longest_prefix:-1] + + # Process in chunks if the number of tokens is large + for i in range(0, tokens_to_process.shape[0], max_chunk_size): + chunk = tokens_to_process[i:i + max_chunk_size] + self.ex_model.forward( + input_ids=chunk.view(1, -1), + params={ + "attn_mode": "flash_attn", + "cache": ex_cache, + "past_len": longest_prefix + i, + "batch_shape": (1, self.max_tokens), + "reconstruct": False # Force memory-efficient path + } + ) + + current_len = longest_prefix + remaining_tokens if reset: if len(seq_tensor) > 1: - self.ex_model.forward( - input_ids=seq_tensor[:-1].view(1, -1), - params={ - "attn_mode": "flash_attn", - "cache": ex_cache, - "past_len": 0, - "batch_shape": (1, self.max_tokens) - } - ) + # Process all tokens except the last one + tokens_to_process = seq_tensor[:-1] - current_len = len(seq_tensor) - 1 + # Process in chunks if the number of tokens is large + current_len = 0 + for i in range(0, tokens_to_process.shape[0], max_chunk_size): + chunk = tokens_to_process[i:i + max_chunk_size] + self.ex_model.forward( + input_ids=chunk.view(1, -1), + params={ + "attn_mode": "flash_attn", + "cache": ex_cache, + "past_len": current_len, + "batch_shape": (1, self.max_tokens), + "reconstruct": False # Force memory-efficient path + } + ) + current_len += chunk.shape[0] else: current_len = 0 + # Process the last token and get logits logits = self.ex_model.forward( input_ids=seq_tensor[-1:].view(1, -1), params={ "attn_mode": "flash_attn", "cache": ex_cache, "past_len": current_len, - "batch_shape": (1, self.max_tokens) + "batch_shape": (1, self.max_tokens), + "reconstruct": False # Force memory-efficient path } ).to(input_ids.device).float() else: - logits = self.ex_model.forward( - input_ids=seq_tensor.view(1, -1), - params={ - "attn_mode": "flash_attn", - "cache": ex_cache, - "past_len": 0, - "batch_shape": (1, self.max_tokens) - } - ).float() + # When processing with labels, handle as a complete sequence + # Process in chunks if the number of tokens is large + tokens_to_process = seq_tensor + all_logits = None + + for i in range(0, tokens_to_process.shape[0], max_chunk_size): + chunk = tokens_to_process[i:i + max_chunk_size] + chunk_logits = self.ex_model.forward( + input_ids=chunk.view(1, -1), + params={ + "attn_mode": "flash_attn_nc", # No caching for training + "reconstruct": False # Force memory-efficient path + } + ).float() + + if all_logits is None: + all_logits = chunk_logits + else: + all_logits = torch.cat([all_logits, chunk_logits], dim=1) + + logits = all_logits if is_negative: self.past_seq_negative = seq_tensor From bbcaec75b47b1b2495623ff38181478c8ba659f8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 27 Apr 2025 21:13:16 -0700 Subject: [PATCH 02/22] API: Find a new port if the default one is taken (closes #6918) --- extensions/openai/script.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/extensions/openai/script.py b/extensions/openai/script.py index c2dc337b..a995da9d 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -2,6 +2,7 @@ import asyncio import json import logging import os +import socket import traceback from collections import deque from threading import Thread @@ -374,9 +375,26 @@ async def handle_unload_loras(): return JSONResponse(content="OK") +def find_available_port(starting_port): + """Try the starting port, then find an available one if it's taken.""" + try: + # Try to create a socket with the starting port + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', starting_port)) + return starting_port + except OSError: + # Port is already in use, so find a new one + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) # Bind to port 0 to get an available port + new_port = s.getsockname()[1] + logger.warning(f"Port {starting_port} is already in use. Using port {new_port} instead.") + return new_port + + def run_server(): # Parse configuration port = int(os.environ.get('OPENEDAI_PORT', shared.args.api_port)) + port = find_available_port(port) ssl_certfile = os.environ.get('OPENEDAI_CERT_PATH', shared.args.ssl_certfile) ssl_keyfile = os.environ.get('OPENEDAI_KEY_PATH', shared.args.ssl_keyfile) From c6c2855c80e6cf6ebd55602f6fe43d80aaca7fae Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 27 Apr 2025 21:22:21 -0700 Subject: [PATCH 03/22] llama.cpp: Remove the timeout while loading models (closes #6907) --- modules/llama_cpp_server.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 9572d5aa..4b01c228 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -340,9 +340,7 @@ class LlamaServer: # Wait for server to be healthy health_url = f"http://127.0.0.1:{self.port}/health" - start_time = time.time() - timeout = 3600 * 8 # 8 hours - while time.time() - start_time < timeout: + while True: # Check if process is still alive if self.process.poll() is not None: # Process has terminated @@ -357,8 +355,6 @@ class LlamaServer: pass time.sleep(1) - else: - raise TimeoutError(f"Server health check timed out after {timeout} seconds") # Server is now healthy, get model info self._get_vocabulary_size() From be13f5199be5059d9c17a4008642c66f2700fe69 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 27 Apr 2025 21:40:38 -0700 Subject: [PATCH 04/22] UI: Add an info message about how to use Speculative Decoding --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index e3cf2ba6..c9679fad 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -96,7 +96,7 @@ def create_ui(): # Speculative decoding with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']: with gr.Row(): - shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', interactive=not mu) + shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu) ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.') From 15a29e99f8501b229f0f975281e4973870e229df Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 27 Apr 2025 21:41:34 -0700 Subject: [PATCH 05/22] Lint --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index c9679fad..e0ae06f9 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -96,7 +96,7 @@ def create_ui(): # Speculative decoding with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']: with gr.Row(): - shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu) + shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu) ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.') From 1ee0acc852b7ac3e93cd1996414736bb75cfee59 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 28 Apr 2025 15:56:25 -0700 Subject: [PATCH 06/22] llama.cpp: Make --verbose print the llama-server command --- modules/llama_cpp_server.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 4b01c228..1f2db670 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -327,6 +327,11 @@ class LlamaServer: else: env['LD_LIBRARY_PATH'] = os.path.dirname(self.server_path) + if shared.args.verbose: + logger.info(f"llama-server command-line flags:") + print(' '.join(str(item) for item in cmd[1:])) + print() + # Start the server with pipes for output self.process = subprocess.Popen( cmd, From d10bded7f807ad3eff1e5dc43370d8f9e09faf1d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 28 Apr 2025 22:37:01 -0700 Subject: [PATCH 07/22] UI: Add an `enable_thinking` option to enable/disable Qwen3 thinking --- extensions/openai/typing.py | 1 + modules/chat.py | 53 +++++++++++++++++++++++++------- modules/loaders.py | 5 +++ modules/shared.py | 1 + modules/ui.py | 1 + modules/ui_parameters.py | 1 + user_data/settings-template.yaml | 1 + 7 files changed, 52 insertions(+), 11 deletions(-) diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index 4d6018f9..b1979cbc 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -42,6 +42,7 @@ class GenerationOptions(BaseModel): auto_max_new_tokens: bool = False ban_eos_token: bool = False add_bos_token: bool = True + enable_thinking: bool = True skip_special_tokens: bool = True static_cache: bool = False truncation_length: int = 0 diff --git a/modules/chat.py b/modules/chat.py index e117e6ee..98913d5c 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -90,6 +90,44 @@ def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=Tru return prefix, suffix +def get_thinking_suppression_string(template): + """ + Determines what string needs to be added to suppress thinking mode + by comparing template renderings with thinking enabled vs disabled. + """ + + # Render with thinking enabled + with_thinking = template.render( + messages=[{'role': 'user', 'content': ''}], + builtin_tools=None, + tools=None, + tools_in_user_message=False, + add_generation_prompt=True, + enable_thinking=True + ) + + # Render with thinking disabled + without_thinking = template.render( + messages=[{'role': 'user', 'content': ''}], + builtin_tools=None, + tools=None, + tools_in_user_message=False, + add_generation_prompt=True, + enable_thinking=False + ) + + # Find the difference (what gets added to suppress thinking) + i = 0 + while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]: + i += 1 + + j = 0 + while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]: + j += 1 + + return without_thinking[i:len(without_thinking) - j if j else None] + + def generate_chat_prompt(user_input, state, **kwargs): impersonate = kwargs.get('impersonate', False) _continue = kwargs.get('_continue', False) @@ -147,13 +185,6 @@ def generate_chat_prompt(user_input, state, **kwargs): if user_input and not impersonate and not _continue: messages.append({"role": "user", "content": user_input}) - def remove_extra_bos(prompt): - for bos_token in ['', '<|startoftext|>', '', '<|endoftext|>']: - while prompt.startswith(bos_token): - prompt = prompt[len(bos_token):] - - return prompt - def make_prompt(messages): if state['mode'] == 'chat-instruct' and _continue: prompt = renderer(messages=messages[:-1]) @@ -165,7 +196,6 @@ def generate_chat_prompt(user_input, state, **kwargs): if state['custom_system_message'].strip() != '': outer_messages.append({"role": "system", "content": state['custom_system_message']}) - prompt = remove_extra_bos(prompt) command = state['chat-instruct_command'] command = command.replace('<|character|>', state['name2'] if not impersonate else state['name1']) command = command.replace('<|prompt|>', prompt) @@ -182,11 +212,10 @@ def generate_chat_prompt(user_input, state, **kwargs): outer_messages.append({"role": "user", "content": command}) outer_messages.append({"role": "assistant", "content": prefix}) - prompt = instruction_template.render(messages=outer_messages) + prompt = instruct_renderer(messages=outer_messages) suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1] if len(suffix) > 0: prompt = prompt[:-len(suffix)] - else: if _continue: suffix = get_generation_prompt(renderer, impersonate=impersonate)[1] @@ -199,7 +228,9 @@ def generate_chat_prompt(user_input, state, **kwargs): prompt += prefix - prompt = remove_extra_bos(prompt) + if state['mode'] == 'instruct' and not any((_continue, impersonate, state['enable_thinking'])): + prompt += get_thinking_suppression_string(instruction_template) + return prompt prompt = make_prompt(messages) diff --git a/modules/loaders.py b/modules/loaders.py index b8ae82d7..738198b1 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -143,6 +143,7 @@ def transformers_samplers(): 'auto_max_new_tokens', 'ban_eos_token', 'add_bos_token', + 'enable_thinking', 'skip_special_tokens', 'static_cache', 'seed', @@ -195,6 +196,7 @@ loaders_samplers = { 'auto_max_new_tokens', 'ban_eos_token', 'add_bos_token', + 'enable_thinking', 'skip_special_tokens', 'seed', 'sampler_priority', @@ -241,6 +243,7 @@ loaders_samplers = { 'auto_max_new_tokens', 'ban_eos_token', 'add_bos_token', + 'enable_thinking', 'skip_special_tokens', 'seed', 'sampler_priority', @@ -279,6 +282,7 @@ loaders_samplers = { 'auto_max_new_tokens', 'ban_eos_token', 'add_bos_token', + 'enable_thinking', 'skip_special_tokens', 'seed', 'custom_token_bans', @@ -311,6 +315,7 @@ loaders_samplers = { 'auto_max_new_tokens', 'ban_eos_token', 'add_bos_token', + 'enable_thinking', 'seed', 'sampler_priority', 'dry_sequence_breakers', diff --git a/modules/shared.py b/modules/shared.py index 5d9dd362..4c1179e3 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -51,6 +51,7 @@ settings = { 'auto_max_new_tokens': True, 'ban_eos_token': False, 'add_bos_token': True, + 'enable_thinking': True, 'skip_special_tokens': True, 'stream': True, 'static_cache': False, diff --git a/modules/ui.py b/modules/ui.py index f137e62d..4105f53b 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -198,6 +198,7 @@ def list_interface_input_elements(): 'auto_max_new_tokens', 'ban_eos_token', 'add_bos_token', + 'enable_thinking', 'skip_special_tokens', 'stream', 'static_cache', diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 6c2715af..3f609d71 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -82,6 +82,7 @@ def create_ui(default_preset): shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle mode.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.') diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml index 83764f97..80923276 100644 --- a/user_data/settings-template.yaml +++ b/user_data/settings-template.yaml @@ -22,6 +22,7 @@ max_updates_second: 0 auto_max_new_tokens: true ban_eos_token: false add_bos_token: true +enable_thinking: true skip_special_tokens: true stream: true static_cache: false From c5fb51e5d12322bfb08458060913d9d1c90c5318 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 28 Apr 2025 22:39:42 -0700 Subject: [PATCH 08/22] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c8ac8a86..08ffe37d 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases #### Option 2: One-click installer 1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip). -2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`. +2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, or `start_macos.sh`. 3) Select your GPU vendor when asked. 4) After installation completes, browse to `http://localhost:7860`. 5) Have fun! From 1dd4aedbe1edcc8fbfd7e7be07f170dbfaa7f0cf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Tue, 29 Apr 2025 05:28:46 -0700 Subject: [PATCH 09/22] Fix the streaming_llm UI checkbox not being interactive --- modules/ui.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/ui.py b/modules/ui.py index 4105f53b..fb016f87 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -114,6 +114,7 @@ def list_model_elements(): 'cache_type', 'tensor_split', 'extra_flags', + 'streaming_llm', 'gpu_split', 'alpha_value', 'rope_freq_base', From c5fe92d1526cfe30825194f603acf2db38c964ec Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 05:24:58 -0700 Subject: [PATCH 10/22] Bump llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 2 +- requirements/portable/requirements_amd_noavx2.txt | 2 +- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index c20c161e..ac441e63 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 437da5b5..b0da6c64 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index b1c87990..1a5191f5 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index e62987b0..ff2799f6 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index f7a9f114..a5ca209e 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index b8cd8390..dfef9add 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 3b52d59b..5b6e1124 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index a04e8979..baecb103 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 5c717343..c6855bc8 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index b616193d..a7754ece 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt index de4740c9..72b1e57a 100644 --- a/requirements/portable/requirements_amd_noavx2.txt +++ b/requirements/portable/requirements_amd_noavx2.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 6310327d..3e0401c9 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index f69b58e7..b7efcaf0 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index dafa6bbe..26ebcd43 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index c02191eb..54ebf5b4 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 456188b4..31b2bb3c 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 7e733967..7f1b4c51 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 0329a598..29bb1e06 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From 7f49e3c3ce6a09ffdd52526e6d98b3a886ccb47e Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 05:25:09 -0700 Subject: [PATCH 11/22] Bump ExLlamaV3 --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_apple_intel.txt | 2 +- requirements/full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_noavx2.txt | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index ac441e63..038c384b 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index ff2799f6..e9a54c33 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -31,5 +31,5 @@ tiktoken # Mac wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index a5ca209e..cf8b3d7e 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -32,5 +32,5 @@ tiktoken https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index baecb103..87692075 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -32,8 +32,8 @@ tiktoken # CUDA wheels https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" From 771d3d8ed640bd412edfd597420213ffb1e6fc78 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 06:48:32 -0700 Subject: [PATCH 12/22] Fix getting the llama.cpp logprobs for Qwen3-30B-A3B --- modules/logits.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/logits.py b/modules/logits.py index 9a4243ff..32aef7ae 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -45,6 +45,9 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur output = {} for entry in logprobs: token = repr(entry['token']) + if len(token) > 2 and token.startswith("'") and token.endswith("'"): + token = token[1:-1] + prob = entry['prob'] if use_samplers else np.exp(entry['logprob']) output[token] = prob return output @@ -52,6 +55,9 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur output = '' for entry in logprobs: token = repr(entry['token']) + if len(token) > 2 and token.startswith("'") and token.endswith("'"): + token = token[1:-1] + prob = entry['prob'] if use_samplers else np.exp(entry['logprob']) output += f"{prob:.5f} - {token}\n" return output, previous From e9569c3984c4f439d32425727cc1b2b7fc1178de Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 06:57:23 -0700 Subject: [PATCH 13/22] Fixes after c5fe92d1526cfe30825194f603acf2db38c964ec --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 2 +- requirements/portable/requirements_amd_noavx2.txt | 2 +- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 038c384b..532b6f9d 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index b0da6c64..e887a5eb 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 1a5191f5..53a036de 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index e9a54c33..0c7f8671 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index cf8b3d7e..c9682d01 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index dfef9add..f758fe8e 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 5b6e1124..7a319ccf 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 87692075..5909c251 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index c6855bc8..8f03b485 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index a7754ece..c91d998c 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt index 72b1e57a..b2154c9f 100644 --- a/requirements/portable/requirements_amd_noavx2.txt +++ b/requirements/portable/requirements_amd_noavx2.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 3e0401c9..81ad7b36 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index b7efcaf0..f17dccf5 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 26ebcd43..076f1b9c 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 54ebf5b4..38134ccf 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 31b2bb3c..4bbebc4e 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 7f1b4c51..58f9f985 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 29bb1e06..7e5080d5 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From a4bf3397240e3f52594186918da37a3224450221 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 11:13:14 -0700 Subject: [PATCH 14/22] Bump llama.cpp --- requirements/full/requirements.txt | 4 ++-- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- requirements/full/requirements_apple_intel.txt | 4 ++-- requirements/full/requirements_apple_silicon.txt | 6 +++--- requirements/full/requirements_cpu_only.txt | 4 ++-- requirements/full/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/full/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements.txt | 4 ++-- requirements/portable/requirements_amd.txt | 2 +- requirements/portable/requirements_amd_noavx2.txt | 2 +- requirements/portable/requirements_apple_intel.txt | 4 ++-- requirements/portable/requirements_apple_silicon.txt | 6 +++--- requirements/portable/requirements_cpu_only.txt | 4 ++-- requirements/portable/requirements_cpu_only_noavx2.txt | 4 ++-- requirements/portable/requirements_noavx2.txt | 4 ++-- requirements/portable/requirements_vulkan.txt | 4 ++-- requirements/portable/requirements_vulkan_noavx2.txt | 4 ++-- 18 files changed, 34 insertions(+), 34 deletions(-) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 532b6f9d..6f265eba 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index e887a5eb..c8e75ee7 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 53a036de..e54d6d9c 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -29,6 +29,6 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 0c7f8671..d714ea3d 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -29,7 +29,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index c9682d01..89f4f576 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -29,8 +29,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index f758fe8e..47ad5759 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 7a319ccf..334f11df 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -29,5 +29,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 5909c251..e216c9cd 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -30,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 8f03b485..c720daa7 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index c91d998c..7d9c00c0 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt index b2154c9f..d718c1b1 100644 --- a/requirements/portable/requirements_amd_noavx2.txt +++ b/requirements/portable/requirements_amd_noavx2.txt @@ -15,4 +15,4 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 81ad7b36..9e184b53 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index f17dccf5..ec059716 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -15,6 +15,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 076f1b9c..d473b824 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 38134ccf..d3fffb43 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 4bbebc4e..cdfa6a01 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 58f9f985..1a7ce6ed 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 7e5080d5..4737321d 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -15,5 +15,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.7.0/llama_cpp_binaries-0.7.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" From b46ca01340dfc096e88147e4f9dd3f971708d635 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 14:53:15 -0700 Subject: [PATCH 15/22] UI: Set max_updates_second to 12 by default When the tokens/second at at ~50 and the model is a thinking model, the markdown rendering for the streaming message becomes a CPU bottleneck. --- modules/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/shared.py b/modules/shared.py index 4c1179e3..fb10c014 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -47,7 +47,7 @@ settings = { 'max_new_tokens_max': 4096, 'prompt_lookup_num_tokens': 0, 'max_tokens_second': 0, - 'max_updates_second': 0, + 'max_updates_second': 12, 'auto_max_new_tokens': True, 'ban_eos_token': False, 'add_bos_token': True, From cd5c32dc197a6f62c7e328a36507eb202ff6b6a5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 14:54:05 -0700 Subject: [PATCH 16/22] UI: Fix max_updates_second not working --- modules/text_generation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/text_generation.py b/modules/text_generation.py index 4e3d1d7a..8d091868 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -100,8 +100,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap last_update = cur_time yield reply - yield reply - if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything): break From 195a45c6e15c2af2075ea859caadbb6076497695 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 15:12:46 -0700 Subject: [PATCH 17/22] UI: Make thinking blocks closed by default --- js/global_scope_js.js | 22 ++++++++++++---------- modules/html_generator.py | 2 +- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/js/global_scope_js.js b/js/global_scope_js.js index e808c473..29d2d8bd 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -31,13 +31,13 @@ function removeLastClick() { } function handleMorphdomUpdate(text) { - // Track closed blocks - const closedBlocks = new Set(); + // Track open blocks + const openBlocks = new Set(); document.querySelectorAll(".thinking-block").forEach(block => { const blockId = block.getAttribute("data-block-id"); - // If block exists and is not open, add to closed set - if (blockId && !block.hasAttribute("open")) { - closedBlocks.add(blockId); + // If block exists and is open, add to open set + if (blockId && block.hasAttribute("open")) { + openBlocks.add(blockId); } }); @@ -72,13 +72,15 @@ function handleMorphdomUpdate(text) { } } - // For thinking blocks, respect closed state + // For thinking blocks, assume closed by default if (fromEl.classList && fromEl.classList.contains("thinking-block") && - toEl.classList && toEl.classList.contains("thinking-block")) { + toEl.classList && toEl.classList.contains("thinking-block")) { const blockId = toEl.getAttribute("data-block-id"); - // If this block was closed by user, keep it closed - if (blockId && closedBlocks.has(blockId)) { - toEl.removeAttribute("open"); + // Remove open attribute by default + toEl.removeAttribute("open"); + // If this block was explicitly opened by user, keep it open + if (blockId && openBlocks.has(blockId)) { + toEl.setAttribute("open", ""); } } diff --git a/modules/html_generator.py b/modules/html_generator.py index c5252c26..67d15b6e 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -167,7 +167,7 @@ def convert_to_markdown(string, message_id=None): title_text = "Thinking..." if is_streaming else "Thought" thinking_block = f''' -
+
From a6c3ec2299fe6dc766bf661ab8171781c8167245 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 15:24:07 -0700 Subject: [PATCH 18/22] llama.cpp: Explicitly send cache_prompt = True --- modules/llama_cpp_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 1f2db670..71d498b1 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -135,6 +135,7 @@ class LlamaServer: "prompt": token_ids, "n_predict": max_new_tokens, "stream": True, + "cache_prompt": True }) if shared.args.verbose: From ec2e641749f3e48cb721cf4e860caf206038c711 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 15:25:26 -0700 Subject: [PATCH 19/22] Update settings-template.yaml --- user_data/settings-template.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml index 80923276..20896da3 100644 --- a/user_data/settings-template.yaml +++ b/user_data/settings-template.yaml @@ -18,7 +18,7 @@ max_new_tokens_min: 1 max_new_tokens_max: 4096 prompt_lookup_num_tokens: 0 max_tokens_second: 0 -max_updates_second: 0 +max_updates_second: 12 auto_max_new_tokens: true ban_eos_token: false add_bos_token: true From 55283bb8f1a1f7fcef65b8f35ee81658e35a46af Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 18:43:45 -0700 Subject: [PATCH 20/22] Fix CFG with ExLlamaV2_HF (closes #6937) --- modules/exllamav2_hf.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index eb801940..4aa46375 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -65,7 +65,7 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin): elif kv_cache_type == 'q4': cache_type = ExLlamaV2Cache_Q4 else: - raise ValueError(f"Invalid cache type for ExLlamaV2: {cache_type}. Valid options are: fp16, fp8, q8, q6, q4.") + raise ValueError(f"Invalid cache type for ExLlamaV2: {kv_cache_type}. Valid options are: fp16, fp8, q8, q6, q4.") # Use TP if specified if shared.args.enable_tp: @@ -78,12 +78,10 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin): self.past_seq = None if shared.args.cfg_cache: - if shared.args.cache_8bit: - self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model) - elif shared.args.cache_4bit: - self.ex_cache_negative = ExLlamaV2Cache_Q4(self.ex_model) + if shared.args.enable_tp: + self.ex_cache_negative = ExLlamaV2Cache_TP(self.ex_model, base=cache_type) else: - self.ex_cache_negative = ExLlamaV2Cache(self.ex_model) + self.ex_cache_negative = cache_type(self.ex_model, lazy=shared.args.autosplit) self.past_seq_negative = None From 307d13b540a0a32a93ed6b8b68c3a828a2576ddf Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 18:58:14 -0700 Subject: [PATCH 21/22] UI: Minor label change --- modules/ui_model_menu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index e0ae06f9..d13bcff7 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -51,7 +51,7 @@ def create_ui(): shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['ctx_size'] = gr.Number(label='ctx_size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.') + shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.') shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') From b950a0c6dbc6c61d2dcfdd61c38a3885e3c072a3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 30 Apr 2025 20:02:10 -0700 Subject: [PATCH 22/22] Lint --- modules/llama_cpp_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 71d498b1..d9187db8 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -329,7 +329,7 @@ class LlamaServer: env['LD_LIBRARY_PATH'] = os.path.dirname(self.server_path) if shared.args.verbose: - logger.info(f"llama-server command-line flags:") + logger.info("llama-server command-line flags:") print(' '.join(str(item) for item in cmd[1:])) print()