Merge pull request #6939 from oobabooga/dev

Merge dev branch
Author: oobabooga
Date: 2025-05-01 00:15:11 -03:00
Commit: a41da1ec95 (committed via GitHub)
35 changed files with 210 additions and 111 deletions

View file

@@ -36,7 +36,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases
 #### Option 2: One-click installer
 1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip).
-2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`.
+2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, or `start_macos.sh`.
 3) Select your GPU vendor when asked.
 4) After installation completes, browse to `http://localhost:7860`.
 5) Have fun!

View file

@@ -2,6 +2,7 @@ import asyncio
 import json
 import logging
 import os
+import socket
 import traceback
 from collections import deque
 from threading import Thread
@@ -374,9 +375,26 @@ async def handle_unload_loras():
     return JSONResponse(content="OK")
 
 
+def find_available_port(starting_port):
+    """Try the starting port, then find an available one if it's taken."""
+    try:
+        # Try to create a socket with the starting port
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(('', starting_port))
+            return starting_port
+    except OSError:
+        # Port is already in use, so find a new one
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            s.bind(('', 0))  # Bind to port 0 to get an available port
+            new_port = s.getsockname()[1]
+            logger.warning(f"Port {starting_port} is already in use. Using port {new_port} instead.")
+            return new_port
+
+
 def run_server():
     # Parse configuration
     port = int(os.environ.get('OPENEDAI_PORT', shared.args.api_port))
+    port = find_available_port(port)
     ssl_certfile = os.environ.get('OPENEDAI_CERT_PATH', shared.args.ssl_certfile)
     ssl_keyfile = os.environ.get('OPENEDAI_KEY_PATH', shared.args.ssl_keyfile)
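The new `find_available_port` helper only changes how the API port is chosen; the rest of `run_server` is untouched. Below is a minimal, self-contained sketch of the same fallback behavior for trying outside the extension (the module-level `logger` is replaced with a local one, and port 5000 is just an arbitrary example):

```python
import logging
import socket

logger = logging.getLogger(__name__)


def find_available_port(starting_port):
    """Return starting_port if it is free, otherwise an OS-assigned free port."""
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(('', starting_port))
            return starting_port
    except OSError:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind(('', 0))  # port 0 asks the OS for any free port
            new_port = s.getsockname()[1]
            logger.warning(f"Port {starting_port} is already in use. Using port {new_port} instead.")
            return new_port


if __name__ == "__main__":
    # Occupy port 5000 (assuming it is free right now), then request it again
    # to trigger the fallback path.
    blocker = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    blocker.bind(('', 5000))
    blocker.listen()
    print(find_available_port(5000))  # prints some other free port
    blocker.close()
```

Binding to port 0 lets the OS pick any free port, which is why the helper never has to loop or guess.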

View file

@@ -42,6 +42,7 @@ class GenerationOptions(BaseModel):
     auto_max_new_tokens: bool = False
     ban_eos_token: bool = False
     add_bos_token: bool = True
+    enable_thinking: bool = True
     skip_special_tokens: bool = True
     static_cache: bool = False
     truncation_length: int = 0
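Because `GenerationOptions` feeds the OpenAI-compatible endpoints, the new field can be passed directly in a chat completion request. A hedged sketch, assuming the API extension is running locally on its default port (5000):

```python
import requests

url = "http://127.0.0.1:5000/v1/chat/completions"
payload = {
    "messages": [{"role": "user", "content": "What is 2 + 2?"}],
    "max_tokens": 64,
    "enable_thinking": False,  # new GenerationOptions field; defaults to True
}
response = requests.post(url, json=payload, timeout=120)
print(response.json()["choices"][0]["message"]["content"])
```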

View file

@@ -31,13 +31,13 @@ function removeLastClick() {
 }
 
 function handleMorphdomUpdate(text) {
-  // Track closed blocks
-  const closedBlocks = new Set();
+  // Track open blocks
+  const openBlocks = new Set();
   document.querySelectorAll(".thinking-block").forEach(block => {
     const blockId = block.getAttribute("data-block-id");
-    // If block exists and is not open, add to closed set
-    if (blockId && !block.hasAttribute("open")) {
-      closedBlocks.add(blockId);
+    // If block exists and is open, add to open set
+    if (blockId && block.hasAttribute("open")) {
+      openBlocks.add(blockId);
     }
   });
@@ -72,13 +72,15 @@ function handleMorphdomUpdate(text) {
         }
       }
 
-      // For thinking blocks, respect closed state
+      // For thinking blocks, assume closed by default
       if (fromEl.classList && fromEl.classList.contains("thinking-block") &&
         toEl.classList && toEl.classList.contains("thinking-block")) {
         const blockId = toEl.getAttribute("data-block-id");
-        // If this block was closed by user, keep it closed
-        if (blockId && closedBlocks.has(blockId)) {
-          toEl.removeAttribute("open");
-        }
+        // Remove open attribute by default
+        toEl.removeAttribute("open");
+
+        // If this block was explicitly opened by user, keep it open
+        if (blockId && openBlocks.has(blockId)) {
+          toEl.setAttribute("open", "");
+        }
       }

View file

@@ -90,6 +90,44 @@ def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=Tru
     return prefix, suffix
 
 
+def get_thinking_suppression_string(template):
+    """
+    Determines what string needs to be added to suppress thinking mode
+    by comparing template renderings with thinking enabled vs disabled.
+    """
+
+    # Render with thinking enabled
+    with_thinking = template.render(
+        messages=[{'role': 'user', 'content': ''}],
+        builtin_tools=None,
+        tools=None,
+        tools_in_user_message=False,
+        add_generation_prompt=True,
+        enable_thinking=True
+    )
+
+    # Render with thinking disabled
+    without_thinking = template.render(
+        messages=[{'role': 'user', 'content': ''}],
+        builtin_tools=None,
+        tools=None,
+        tools_in_user_message=False,
+        add_generation_prompt=True,
+        enable_thinking=False
+    )
+
+    # Find the difference (what gets added to suppress thinking)
+    i = 0
+    while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]:
+        i += 1
+
+    j = 0
+    while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]:
+        j += 1
+
+    return without_thinking[i:len(without_thinking) - j if j else None]
+
+
 def generate_chat_prompt(user_input, state, **kwargs):
     impersonate = kwargs.get('impersonate', False)
     _continue = kwargs.get('_continue', False)
@@ -147,13 +185,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
     if user_input and not impersonate and not _continue:
         messages.append({"role": "user", "content": user_input})
 
-    def remove_extra_bos(prompt):
-        for bos_token in ['<s>', '<|startoftext|>', '<BOS_TOKEN>', '<|endoftext|>']:
-            while prompt.startswith(bos_token):
-                prompt = prompt[len(bos_token):]
-
-        return prompt
-
     def make_prompt(messages):
         if state['mode'] == 'chat-instruct' and _continue:
             prompt = renderer(messages=messages[:-1])
@@ -165,7 +196,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
             if state['custom_system_message'].strip() != '':
                 outer_messages.append({"role": "system", "content": state['custom_system_message']})
 
-            prompt = remove_extra_bos(prompt)
             command = state['chat-instruct_command']
             command = command.replace('<|character|>', state['name2'] if not impersonate else state['name1'])
             command = command.replace('<|prompt|>', prompt)
@@ -182,11 +212,10 @@ def generate_chat_prompt(user_input, state, **kwargs):
             outer_messages.append({"role": "user", "content": command})
             outer_messages.append({"role": "assistant", "content": prefix})
 
-            prompt = instruction_template.render(messages=outer_messages)
+            prompt = instruct_renderer(messages=outer_messages)
             suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1]
             if len(suffix) > 0:
                 prompt = prompt[:-len(suffix)]
         else:
             if _continue:
                 suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
@@ -199,7 +228,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
                 prompt += prefix
 
-        prompt = remove_extra_bos(prompt)
+        if state['mode'] == 'instruct' and not any((_continue, impersonate, state['enable_thinking'])):
+            prompt += get_thinking_suppression_string(instruction_template)
+
         return prompt
 
     prompt = make_prompt(messages)
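`get_thinking_suppression_string` works purely by diffing two renderings of the chat template, so it needs no model-specific knowledge. Here is a self-contained sketch of that idea using a made-up Jinja2 template (the real code passes the model's own instruction template plus a few extra render variables such as `tools` and `builtin_tools`):

```python
from jinja2 import Template

# Hypothetical chat template: when enable_thinking is False it appends an
# empty <think></think> block that pre-fills (and thereby suppresses) reasoning.
template = Template(
    "{% for m in messages %}<|{{ m.role }}|>{{ m.content }}{% endfor %}"
    "{% if add_generation_prompt %}<|assistant|>"
    "{% if not enable_thinking %}<think>\n\n</think>\n\n{% endif %}"
    "{% endif %}"
)


def thinking_suppression_string(template):
    """Return what the template adds when enable_thinking=False (same diffing as modules/chat.py)."""
    def render(flag):
        return template.render(
            messages=[{'role': 'user', 'content': ''}],
            add_generation_prompt=True,
            enable_thinking=flag,
        )

    with_thinking, without_thinking = render(True), render(False)

    # Strip the common prefix...
    i = 0
    while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]:
        i += 1

    # ...and the common suffix; what remains is the suppression string.
    j = 0
    while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]:
        j += 1

    return without_thinking[i:len(without_thinking) - j if j else None]


print(repr(thinking_suppression_string(template)))  # '<think>\n\n</think>\n\n'
```

The common prefix and suffix of the two renderings are removed, leaving only the text the template inserts when `enable_thinking=False`, which is then appended to the prompt.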

View file

@@ -65,7 +65,7 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin):
         elif kv_cache_type == 'q4':
             cache_type = ExLlamaV2Cache_Q4
         else:
-            raise ValueError(f"Invalid cache type for ExLlamaV2: {cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")
+            raise ValueError(f"Invalid cache type for ExLlamaV2: {kv_cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")
 
         # Use TP if specified
         if shared.args.enable_tp:
@@ -78,12 +78,10 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin):
         self.past_seq = None
 
         if shared.args.cfg_cache:
-            if shared.args.cache_8bit:
-                self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model)
-            elif shared.args.cache_4bit:
-                self.ex_cache_negative = ExLlamaV2Cache_Q4(self.ex_model)
+            if shared.args.enable_tp:
+                self.ex_cache_negative = ExLlamaV2Cache_TP(self.ex_model, base=cache_type)
             else:
-                self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
+                self.ex_cache_negative = cache_type(self.ex_model, lazy=shared.args.autosplit)
 
             self.past_seq_negative = None
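The effect of the cfg_cache change is that the negative (CFG) cache now mirrors whatever cache class was selected for the main cache, instead of being limited to the old 8-bit/4-bit flags. A rough sketch of that selection logic with placeholder classes (illustration only; the real code uses the ExLlamaV2 cache classes imported at the top of the module):

```python
# Placeholder stand-ins for the ExLlamaV2 cache classes; illustration only.
class Cache: ...
class CacheFP8(Cache): ...
class CacheQ8(Cache): ...
class CacheQ6(Cache): ...
class CacheQ4(Cache): ...

CACHE_TYPES = {'fp16': Cache, 'fp8': CacheFP8, 'q8': CacheQ8, 'q6': CacheQ6, 'q4': CacheQ4}


def pick_cache_type(kv_cache_type: str):
    """Map the user-facing cache_type string to a cache class, as the loader does."""
    try:
        return CACHE_TYPES[kv_cache_type.lower()]
    except KeyError:
        raise ValueError(f"Invalid cache type for ExLlamaV2: {kv_cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")


cache_type = pick_cache_type('q6')
# Both the main cache and the CFG (negative) cache are built from the same class,
# so their quantization always matches.
main_cache = cache_type()
negative_cache = cache_type()
```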

View file

@@ -118,6 +118,9 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
         seq_tensor = torch.tensor(seq)
         reset = True
 
+        # Maximum number of tokens to process in a single forward pass
+        max_chunk_size = 2048
+
         # Make the forward call
         if labels is None:
             if past_seq is not None:
@@ -131,55 +134,85 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
                 if longest_prefix > 0:
                     reset = False
                     current_len = longest_prefix
-                    if len(seq_tensor) - longest_prefix > 1:
-                        self.ex_model.forward(
-                            input_ids=seq_tensor[longest_prefix:-1].view(1, -1),
-                            params={
-                                "attn_mode": "flash_attn",
-                                "cache": ex_cache,
-                                "past_len": longest_prefix,
-                                "batch_shape": (1, self.max_tokens)
-                            }
-                        )
-                        current_len = longest_prefix + len(seq_tensor) - longest_prefix - 1
+                    remaining_tokens = len(seq_tensor) - longest_prefix - 1
+                    if remaining_tokens > 0:
+                        # Process tokens from longest_prefix to second-to-last token
+                        tokens_to_process = seq_tensor[longest_prefix:-1]
+
+                        # Process in chunks if the number of tokens is large
+                        for i in range(0, tokens_to_process.shape[0], max_chunk_size):
+                            chunk = tokens_to_process[i:i + max_chunk_size]
+                            self.ex_model.forward(
+                                input_ids=chunk.view(1, -1),
+                                params={
+                                    "attn_mode": "flash_attn",
+                                    "cache": ex_cache,
+                                    "past_len": longest_prefix + i,
+                                    "batch_shape": (1, self.max_tokens),
+                                    "reconstruct": False  # Force memory-efficient path
+                                }
+                            )
+
+                        current_len = longest_prefix + remaining_tokens
 
             if reset:
                 if len(seq_tensor) > 1:
-                    self.ex_model.forward(
-                        input_ids=seq_tensor[:-1].view(1, -1),
-                        params={
-                            "attn_mode": "flash_attn",
-                            "cache": ex_cache,
-                            "past_len": 0,
-                            "batch_shape": (1, self.max_tokens)
-                        }
-                    )
-                    current_len = len(seq_tensor) - 1
+                    # Process all tokens except the last one
+                    tokens_to_process = seq_tensor[:-1]
+
+                    # Process in chunks if the number of tokens is large
+                    current_len = 0
+                    for i in range(0, tokens_to_process.shape[0], max_chunk_size):
+                        chunk = tokens_to_process[i:i + max_chunk_size]
+                        self.ex_model.forward(
+                            input_ids=chunk.view(1, -1),
+                            params={
+                                "attn_mode": "flash_attn",
+                                "cache": ex_cache,
+                                "past_len": current_len,
+                                "batch_shape": (1, self.max_tokens),
+                                "reconstruct": False  # Force memory-efficient path
+                            }
+                        )
+                        current_len += chunk.shape[0]
                 else:
                     current_len = 0
 
+            # Process the last token and get logits
             logits = self.ex_model.forward(
                 input_ids=seq_tensor[-1:].view(1, -1),
                 params={
                     "attn_mode": "flash_attn",
                     "cache": ex_cache,
                     "past_len": current_len,
-                    "batch_shape": (1, self.max_tokens)
+                    "batch_shape": (1, self.max_tokens),
+                    "reconstruct": False  # Force memory-efficient path
                 }
             ).to(input_ids.device).float()
         else:
-            logits = self.ex_model.forward(
-                input_ids=seq_tensor.view(1, -1),
-                params={
-                    "attn_mode": "flash_attn",
-                    "cache": ex_cache,
-                    "past_len": 0,
-                    "batch_shape": (1, self.max_tokens)
-                }
-            ).float()
+            # When processing with labels, handle as a complete sequence
+            # Process in chunks if the number of tokens is large
+            tokens_to_process = seq_tensor
+            all_logits = None
+
+            for i in range(0, tokens_to_process.shape[0], max_chunk_size):
+                chunk = tokens_to_process[i:i + max_chunk_size]
+                chunk_logits = self.ex_model.forward(
+                    input_ids=chunk.view(1, -1),
+                    params={
+                        "attn_mode": "flash_attn_nc",  # No caching for training
+                        "reconstruct": False  # Force memory-efficient path
+                    }
+                ).float()
+
+                if all_logits is None:
+                    all_logits = chunk_logits
+                else:
+                    all_logits = torch.cat([all_logits, chunk_logits], dim=1)
+
+            logits = all_logits
 
         if is_negative:
             self.past_seq_negative = seq_tensor
         else:
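The core of this change is that long prompts are no longer pushed through `ex_model.forward()` in a single call; they are split into chunks of at most 2048 tokens, with `past_len` advanced after each chunk. A framework-free sketch of that pattern, where `forward` is a hypothetical stand-in for the model call:

```python
import torch

MAX_CHUNK_SIZE = 2048  # mirrors max_chunk_size in the diff


def prefill_in_chunks(seq_tensor: torch.Tensor, past_len: int, forward) -> int:
    """Feed all tokens except the last one through `forward` in fixed-size chunks.

    `forward(input_ids, past_len)` stands in for ex_model.forward(); it only needs
    to consume the chunk and extend the cache starting at `past_len`.
    Returns the new cache length.
    """
    tokens_to_process = seq_tensor[past_len:-1]
    for i in range(0, tokens_to_process.shape[0], MAX_CHUNK_SIZE):
        chunk = tokens_to_process[i:i + MAX_CHUNK_SIZE]
        forward(chunk.view(1, -1), past_len + i)
    return past_len + tokens_to_process.shape[0]


# Toy usage: record which (chunk length, past_len) pairs would hit the model.
calls = []
seq = torch.arange(5000)
new_len = prefill_in_chunks(seq, past_len=0,
                            forward=lambda ids, past: calls.append((ids.shape[1], past)))
print(calls)    # [(2048, 0), (2048, 2048), (903, 4096)]
print(new_len)  # 4999
```

Chunking keeps peak activation memory bounded by the chunk size rather than the full prompt length, which fits with the diff also forcing the memory-efficient path via `"reconstruct": False`.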

View file

@@ -167,7 +167,7 @@ def convert_to_markdown(string, message_id=None):
         title_text = "Thinking..." if is_streaming else "Thought"
         thinking_block = f'''
-        <details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}" open>
+        <details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
             <summary class="thinking-header">
                 <svg class="thinking-icon" width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
                     <path d="M8 1.33334C4.31868 1.33334 1.33334 4.31868 1.33334 8.00001C1.33334 11.6813 4.31868 14.6667 8 14.6667C11.6813 14.6667 14.6667 11.6813 14.6667 8.00001C14.6667 4.31868 11.6813 1.33334 8 1.33334Z" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>

View file

@@ -135,6 +135,7 @@ class LlamaServer:
             "prompt": token_ids,
             "n_predict": max_new_tokens,
             "stream": True,
+            "cache_prompt": True
         })
 
         if shared.args.verbose:
@@ -327,6 +328,11 @@ class LlamaServer:
         else:
            env['LD_LIBRARY_PATH'] = os.path.dirname(self.server_path)
 
+        if shared.args.verbose:
+            logger.info("llama-server command-line flags:")
+            print(' '.join(str(item) for item in cmd[1:]))
+            print()
+
         # Start the server with pipes for output
         self.process = subprocess.Popen(
             cmd,
@@ -340,9 +346,7 @@ class LlamaServer:
         # Wait for server to be healthy
         health_url = f"http://127.0.0.1:{self.port}/health"
-        start_time = time.time()
-        timeout = 3600 * 8  # 8 hours
-        while time.time() - start_time < timeout:
+        while True:
             # Check if process is still alive
             if self.process.poll() is not None:
                 # Process has terminated
@@ -357,8 +361,6 @@ class LlamaServer:
                 pass
 
             time.sleep(1)
-        else:
-            raise TimeoutError(f"Server health check timed out after {timeout} seconds")
 
         # Server is now healthy, get model info
         self._get_vocabulary_size()
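With the timeout gone, startup now waits indefinitely as long as the llama-server process stays alive, and fails fast only if the process itself exits. A standalone sketch of that loop (function name and error type are illustrative, not the project's API; the real method also surfaces the server's output):

```python
import subprocess
import time

import requests


def wait_for_health(process: subprocess.Popen, port: int) -> None:
    """Block until the llama-server /health endpoint responds, or the process dies."""
    health_url = f"http://127.0.0.1:{port}/health"
    while True:
        # Fail immediately if the server process has terminated
        if process.poll() is not None:
            raise RuntimeError(f"llama-server exited with code {process.returncode}")

        try:
            if requests.get(health_url, timeout=2).status_code == 200:
                return  # server is ready
        except requests.exceptions.RequestException:
            pass  # not up yet, keep polling

        time.sleep(1)
```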

View file

@@ -143,6 +143,7 @@ def transformers_samplers():
         'auto_max_new_tokens',
         'ban_eos_token',
         'add_bos_token',
+        'enable_thinking',
         'skip_special_tokens',
         'static_cache',
         'seed',
@@ -195,6 +196,7 @@ loaders_samplers = {
         'auto_max_new_tokens',
         'ban_eos_token',
         'add_bos_token',
+        'enable_thinking',
         'skip_special_tokens',
         'seed',
         'sampler_priority',
@@ -241,6 +243,7 @@ loaders_samplers = {
         'auto_max_new_tokens',
         'ban_eos_token',
         'add_bos_token',
+        'enable_thinking',
         'skip_special_tokens',
         'seed',
         'sampler_priority',
@@ -279,6 +282,7 @@ loaders_samplers = {
         'auto_max_new_tokens',
         'ban_eos_token',
         'add_bos_token',
+        'enable_thinking',
         'skip_special_tokens',
         'seed',
         'custom_token_bans',
@@ -311,6 +315,7 @@ loaders_samplers = {
         'auto_max_new_tokens',
         'ban_eos_token',
         'add_bos_token',
+        'enable_thinking',
         'seed',
         'sampler_priority',
         'dry_sequence_breakers',

View file

@@ -45,6 +45,9 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
         output = {}
         for entry in logprobs:
             token = repr(entry['token'])
+            if len(token) > 2 and token.startswith("'") and token.endswith("'"):
+                token = token[1:-1]
+
             prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
             output[token] = prob
         return output
@@ -52,6 +55,9 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, retur
         output = ''
         for entry in logprobs:
             token = repr(entry['token'])
+            if len(token) > 2 and token.startswith("'") and token.endswith("'"):
+                token = token[1:-1]
+
             prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
             output += f"{prob:.5f} - {token}\n"
         return output, previous
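The added check strips the quotes that `repr()` wraps around ordinary tokens while keeping escape sequences and awkward quoting visible. A quick illustration of the same cleanup in isolation:

```python
def display_token(raw: str) -> str:
    """Mimic the new cleanup: drop repr()'s outer quotes but keep escapes visible."""
    token = repr(raw)
    if len(token) > 2 and token.startswith("'") and token.endswith("'"):
        token = token[1:-1]
    return token


print(display_token("Hello"))  # Hello     (was 'Hello' before the change)
print(display_token("\n"))     # \n        (escape stays visible)
print(display_token("it's"))   # "it's"    (repr used double quotes; left as-is)
```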

View file

@@ -47,10 +47,11 @@ settings = {
     'max_new_tokens_max': 4096,
     'prompt_lookup_num_tokens': 0,
     'max_tokens_second': 0,
-    'max_updates_second': 0,
+    'max_updates_second': 12,
     'auto_max_new_tokens': True,
     'ban_eos_token': False,
     'add_bos_token': True,
+    'enable_thinking': True,
     'skip_special_tokens': True,
     'stream': True,
     'static_cache': False,
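Raising the default `max_updates_second` from 0 (no cap) to 12 limits how often the streamed reply is re-rendered. A hedged sketch of that kind of rate limiting, not the project's exact implementation:

```python
import time


def throttle_stream(chunks, max_updates_second=12):
    """Yield the growing reply at most `max_updates_second` times per second."""
    min_interval = 1 / max_updates_second if max_updates_second > 0 else 0
    last_update = 0.0
    reply = ""
    pending = False
    for chunk in chunks:
        reply += chunk
        pending = True
        now = time.time()
        if now - last_update >= min_interval:
            last_update = now
            pending = False
            yield reply
    if pending:
        # Flush whatever the rate limit held back so the final text is complete
        yield reply


for update in throttle_stream(f"word{i} " for i in range(50)):
    print(len(update), "characters so far")
```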

View file

@@ -100,8 +100,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
                 last_update = cur_time
                 yield reply
 
-        yield reply
-
         if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything):
             break

View file

@@ -114,6 +114,7 @@ def list_model_elements():
         'cache_type',
         'tensor_split',
         'extra_flags',
+        'streaming_llm',
         'gpu_split',
         'alpha_value',
         'rope_freq_base',
@@ -198,6 +199,7 @@ def list_interface_input_elements():
         'auto_max_new_tokens',
         'ban_eos_token',
         'add_bos_token',
+        'enable_thinking',
         'skip_special_tokens',
         'stream',
         'static_cache',

View file

@@ -51,7 +51,7 @@ def create_ui():
             shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
             shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
             shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
-            shared.gradio['ctx_size'] = gr.Number(label='ctx_size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
+            shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
             shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
             shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
@@ -96,7 +96,7 @@ def create_ui():
         # Speculative decoding
         with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
             with gr.Row():
-                shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', interactive=not mu)
+                shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
                 ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
             shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.')

View file

@@ -82,6 +82,7 @@ def create_ui(default_preset):
                 shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
                 shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
                 shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
+                shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle <think> mode.')
                 shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
                 shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
                 shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')

View file

@@ -30,10 +30,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

View file

@@ -29,6 +29,6 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@@ -29,6 +29,6 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@@ -29,7 +29,7 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@@ -29,8 +29,8 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@@ -29,5 +29,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@@ -29,5 +29,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@@ -30,10 +30,10 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,4 +15,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,4 +15,4 @@ sse-starlette==1.6.5
 tiktoken
 
 # AMD wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@@ -15,6 +15,6 @@ sse-starlette==1.6.5
 tiktoken
 
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # llama.cpp (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
 tiktoken
 
 # CUDA wheels
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -18,10 +18,11 @@ max_new_tokens_min: 1
 max_new_tokens_max: 4096
 prompt_lookup_num_tokens: 0
 max_tokens_second: 0
-max_updates_second: 0
+max_updates_second: 12
 auto_max_new_tokens: true
 ban_eos_token: false
 add_bos_token: true
+enable_thinking: true
 skip_special_tokens: true
 stream: true
 static_cache: false