Merge pull request #6939 from oobabooga/dev

Merge dev branch
oobabooga authored 2025-05-01 00:15:11 -03:00, committed by GitHub
commit a41da1ec95
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
35 changed files with 210 additions and 111 deletions

View file

@@ -36,7 +36,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases
#### Option 2: One-click installer
1) Clone or [download the repository](https://github.com/oobabooga/text-generation-webui/archive/refs/heads/main.zip).
2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, `start_macos.sh`, or `start_wsl.bat`.
2) Run the script that matches your OS: `start_linux.sh`, `start_windows.bat`, or `start_macos.sh`.
3) Select your GPU vendor when asked.
4) After installation completes, browse to `http://localhost:7860`.
5) Have fun!

View file

@@ -2,6 +2,7 @@ import asyncio
import json
import logging
import os
import socket
import traceback
from collections import deque
from threading import Thread
@@ -374,9 +375,26 @@ async def handle_unload_loras():
return JSONResponse(content="OK")
def find_available_port(starting_port):
"""Try the starting port, then find an available one if it's taken."""
try:
# Try to create a socket with the starting port
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', starting_port))
return starting_port
except OSError:
# Port is already in use, so find a new one
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(('', 0)) # Bind to port 0 to get an available port
new_port = s.getsockname()[1]
logger.warning(f"Port {starting_port} is already in use. Using port {new_port} instead.")
return new_port
def run_server():
# Parse configuration
port = int(os.environ.get('OPENEDAI_PORT', shared.args.api_port))
port = find_available_port(port)
ssl_certfile = os.environ.get('OPENEDAI_CERT_PATH', shared.args.ssl_certfile)
ssl_keyfile = os.environ.get('OPENEDAI_KEY_PATH', shared.args.ssl_keyfile)

View file

@@ -42,6 +42,7 @@ class GenerationOptions(BaseModel):
auto_max_new_tokens: bool = False
ban_eos_token: bool = False
add_bos_token: bool = True
enable_thinking: bool = True
skip_special_tokens: bool = True
static_cache: bool = False
truncation_length: int = 0
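Because GenerationOptions backs the OpenAI-compatible API schema, the new field can be sent directly in a request body. A hedged sketch against a default local install (host, port, and endpoint path assume an unmodified --api setup):

import requests

response = requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 200,
        # New field: ask thinking-capable models (e.g. Qwen3) to skip <think> blocks
        "enable_thinking": False,
    },
)
print(response.json()["choices"][0]["message"]["content"])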

View file

@@ -31,13 +31,13 @@ function removeLastClick() {
}
function handleMorphdomUpdate(text) {
// Track closed blocks
const closedBlocks = new Set();
// Track open blocks
const openBlocks = new Set();
document.querySelectorAll(".thinking-block").forEach(block => {
const blockId = block.getAttribute("data-block-id");
// If block exists and is not open, add to closed set
if (blockId && !block.hasAttribute("open")) {
closedBlocks.add(blockId);
// If block exists and is open, add to open set
if (blockId && block.hasAttribute("open")) {
openBlocks.add(blockId);
}
});
@@ -72,13 +72,15 @@ function handleMorphdomUpdate(text) {
}
}
// For thinking blocks, respect closed state
// For thinking blocks, assume closed by default
if (fromEl.classList && fromEl.classList.contains("thinking-block") &&
toEl.classList && toEl.classList.contains("thinking-block")) {
const blockId = toEl.getAttribute("data-block-id");
// If this block was closed by user, keep it closed
if (blockId && closedBlocks.has(blockId)) {
// Remove open attribute by default
toEl.removeAttribute("open");
// If this block was explicitly opened by user, keep it open
if (blockId && openBlocks.has(blockId)) {
toEl.setAttribute("open", "");
}
}

View file

@@ -90,6 +90,44 @@ def get_generation_prompt(renderer, impersonate=False, strip_trailing_spaces=True):
return prefix, suffix
def get_thinking_suppression_string(template):
"""
Determines what string needs to be added to suppress thinking mode
by comparing template renderings with thinking enabled vs disabled.
"""
# Render with thinking enabled
with_thinking = template.render(
messages=[{'role': 'user', 'content': ''}],
builtin_tools=None,
tools=None,
tools_in_user_message=False,
add_generation_prompt=True,
enable_thinking=True
)
# Render with thinking disabled
without_thinking = template.render(
messages=[{'role': 'user', 'content': ''}],
builtin_tools=None,
tools=None,
tools_in_user_message=False,
add_generation_prompt=True,
enable_thinking=False
)
# Find the difference (what gets added to suppress thinking)
i = 0
while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]:
i += 1
j = 0
while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]:
j += 1
return without_thinking[i:len(without_thinking) - j if j else None]
def generate_chat_prompt(user_input, state, **kwargs):
impersonate = kwargs.get('impersonate', False)
_continue = kwargs.get('_continue', False)
@@ -147,13 +185,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
if user_input and not impersonate and not _continue:
messages.append({"role": "user", "content": user_input})
def remove_extra_bos(prompt):
for bos_token in ['<s>', '<|startoftext|>', '<BOS_TOKEN>', '<|endoftext|>']:
while prompt.startswith(bos_token):
prompt = prompt[len(bos_token):]
return prompt
def make_prompt(messages):
if state['mode'] == 'chat-instruct' and _continue:
prompt = renderer(messages=messages[:-1])
@@ -165,7 +196,6 @@ def generate_chat_prompt(user_input, state, **kwargs):
if state['custom_system_message'].strip() != '':
outer_messages.append({"role": "system", "content": state['custom_system_message']})
prompt = remove_extra_bos(prompt)
command = state['chat-instruct_command']
command = command.replace('<|character|>', state['name2'] if not impersonate else state['name1'])
command = command.replace('<|prompt|>', prompt)
@@ -182,11 +212,10 @@ def generate_chat_prompt(user_input, state, **kwargs):
outer_messages.append({"role": "user", "content": command})
outer_messages.append({"role": "assistant", "content": prefix})
prompt = instruction_template.render(messages=outer_messages)
prompt = instruct_renderer(messages=outer_messages)
suffix = get_generation_prompt(instruct_renderer, impersonate=False)[1]
if len(suffix) > 0:
prompt = prompt[:-len(suffix)]
else:
if _continue:
suffix = get_generation_prompt(renderer, impersonate=impersonate)[1]
@@ -199,7 +228,9 @@ def generate_chat_prompt(user_input, state, **kwargs):
prompt += prefix
prompt = remove_extra_bos(prompt)
if state['mode'] == 'instruct' and not any((_continue, impersonate, state['enable_thinking'])):
prompt += get_thinking_suppression_string(instruction_template)
return prompt
prompt = make_prompt(messages)
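get_thinking_suppression_string() infers the suppression string purely from the template: it renders an empty conversation twice, strips the longest common prefix and suffix, and keeps whatever the enable_thinking=False rendering adds. A toy illustration of the diff step with invented strings that mimic the Qwen3 pattern (no real template involved):

# Hypothetical renderings for enable_thinking=True / False
with_thinking = "<|im_start|>user\n<|im_end|>\n<|im_start|>assistant\n"
without_thinking = with_thinking + "<think>\n\n</think>\n"

i = 0  # longest common prefix
while i < min(len(with_thinking), len(without_thinking)) and with_thinking[i] == without_thinking[i]:
    i += 1
j = 0  # longest common suffix of what remains
while j < min(len(with_thinking), len(without_thinking)) - i and with_thinking[-1 - j] == without_thinking[-1 - j]:
    j += 1
print(repr(without_thinking[i:len(without_thinking) - j if j else None]))
# -> '<think>\n\n</think>\n', the string generate_chat_prompt() appends to pre-empt thinking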

View file

@@ -65,7 +65,7 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin):
elif kv_cache_type == 'q4':
cache_type = ExLlamaV2Cache_Q4
else:
raise ValueError(f"Invalid cache type for ExLlamaV2: {cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")
raise ValueError(f"Invalid cache type for ExLlamaV2: {kv_cache_type}. Valid options are: fp16, fp8, q8, q6, q4.")
# Use TP if specified
if shared.args.enable_tp:
@@ -78,12 +78,10 @@ class Exllamav2HF(PreTrainedModel, GenerationMixin):
self.past_seq = None
if shared.args.cfg_cache:
if shared.args.cache_8bit:
self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model)
elif shared.args.cache_4bit:
self.ex_cache_negative = ExLlamaV2Cache_Q4(self.ex_model)
if shared.args.enable_tp:
self.ex_cache_negative = ExLlamaV2Cache_TP(self.ex_model, base=cache_type)
else:
self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)
self.ex_cache_negative = cache_type(self.ex_model, lazy=shared.args.autosplit)
self.past_seq_negative = None

View file

@@ -118,6 +118,9 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
seq_tensor = torch.tensor(seq)
reset = True
# Maximum number of tokens to process in a single forward pass
max_chunk_size = 2048
# Make the forward call
if labels is None:
if past_seq is not None:
@@ -131,55 +134,85 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
if longest_prefix > 0:
reset = False
current_len = longest_prefix
if len(seq_tensor) - longest_prefix > 1:
remaining_tokens = len(seq_tensor) - longest_prefix - 1
if remaining_tokens > 0:
# Process tokens from longest_prefix to second-to-last token
tokens_to_process = seq_tensor[longest_prefix:-1]
# Process in chunks if the number of tokens is large
for i in range(0, tokens_to_process.shape[0], max_chunk_size):
chunk = tokens_to_process[i:i + max_chunk_size]
self.ex_model.forward(
input_ids=seq_tensor[longest_prefix:-1].view(1, -1),
input_ids=chunk.view(1, -1),
params={
"attn_mode": "flash_attn",
"cache": ex_cache,
"past_len": longest_prefix,
"batch_shape": (1, self.max_tokens)
"past_len": longest_prefix + i,
"batch_shape": (1, self.max_tokens),
"reconstruct": False # Force memory-efficient path
}
)
current_len = longest_prefix + len(seq_tensor) - longest_prefix - 1
current_len = longest_prefix + remaining_tokens
if reset:
if len(seq_tensor) > 1:
# Process all tokens except the last one
tokens_to_process = seq_tensor[:-1]
# Process in chunks if the number of tokens is large
current_len = 0
for i in range(0, tokens_to_process.shape[0], max_chunk_size):
chunk = tokens_to_process[i:i + max_chunk_size]
self.ex_model.forward(
input_ids=seq_tensor[:-1].view(1, -1),
input_ids=chunk.view(1, -1),
params={
"attn_mode": "flash_attn",
"cache": ex_cache,
"past_len": 0,
"batch_shape": (1, self.max_tokens)
"past_len": current_len,
"batch_shape": (1, self.max_tokens),
"reconstruct": False # Force memory-efficient path
}
)
current_len = len(seq_tensor) - 1
current_len += chunk.shape[0]
else:
current_len = 0
# Process the last token and get logits
logits = self.ex_model.forward(
input_ids=seq_tensor[-1:].view(1, -1),
params={
"attn_mode": "flash_attn",
"cache": ex_cache,
"past_len": current_len,
"batch_shape": (1, self.max_tokens)
"batch_shape": (1, self.max_tokens),
"reconstruct": False # Force memory-efficient path
}
).to(input_ids.device).float()
else:
logits = self.ex_model.forward(
input_ids=seq_tensor.view(1, -1),
# When processing with labels, handle as a complete sequence
# Process in chunks if the number of tokens is large
tokens_to_process = seq_tensor
all_logits = None
for i in range(0, tokens_to_process.shape[0], max_chunk_size):
chunk = tokens_to_process[i:i + max_chunk_size]
chunk_logits = self.ex_model.forward(
input_ids=chunk.view(1, -1),
params={
"attn_mode": "flash_attn",
"cache": ex_cache,
"past_len": 0,
"batch_shape": (1, self.max_tokens)
"attn_mode": "flash_attn_nc", # No caching for training
"reconstruct": False # Force memory-efficient path
}
).float()
if all_logits is None:
all_logits = chunk_logits
else:
all_logits = torch.cat([all_logits, chunk_logits], dim=1)
logits = all_logits
if is_negative:
self.past_seq_negative = seq_tensor
else:
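The reworked forward pass applies one pattern in all three branches: split the prompt into max_chunk_size pieces and advance the cache offset by the chunk length on each call. A stripped-down sketch of that pattern with a stand-in forward function (model_forward is a hypothetical placeholder, not the ExLlamaV3 API):

import torch

def chunked_prefill(model_forward, tokens, past_len=0, max_chunk_size=2048):
    """Feed a long token sequence through the model in bounded chunks."""
    for i in range(0, tokens.shape[0], max_chunk_size):
        chunk = tokens[i:i + max_chunk_size]
        # Each call sees the cache as it stood after the previous chunk
        model_forward(input_ids=chunk.view(1, -1), past_len=past_len + i)
    return past_len + tokens.shape[0]  # new cache length

Bounding the chunk size keeps the activation memory of a single forward call flat regardless of prompt length, which complements the "reconstruct": False memory-efficient path.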

View file

@@ -167,7 +167,7 @@ def convert_to_markdown(string, message_id=None):
title_text = "Thinking..." if is_streaming else "Thought"
thinking_block = f'''
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}" open>
<details class="thinking-block" data-block-id="{block_id}" data-streaming="{str(is_streaming).lower()}">
<summary class="thinking-header">
<svg class="thinking-icon" width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M8 1.33334C4.31868 1.33334 1.33334 4.31868 1.33334 8.00001C1.33334 11.6813 4.31868 14.6667 8 14.6667C11.6813 14.6667 14.6667 11.6813 14.6667 8.00001C14.6667 4.31868 11.6813 1.33334 8 1.33334Z" stroke="currentColor" stroke-width="1.33" stroke-linecap="round" stroke-linejoin="round"/>

View file

@@ -135,6 +135,7 @@ class LlamaServer:
"prompt": token_ids,
"n_predict": max_new_tokens,
"stream": True,
"cache_prompt": True
})
if shared.args.verbose:
@@ -327,6 +328,11 @@ class LlamaServer:
else:
env['LD_LIBRARY_PATH'] = os.path.dirname(self.server_path)
if shared.args.verbose:
logger.info("llama-server command-line flags:")
print(' '.join(str(item) for item in cmd[1:]))
print()
# Start the server with pipes for output
self.process = subprocess.Popen(
cmd,
@@ -340,9 +346,7 @@ class LlamaServer:
# Wait for server to be healthy
health_url = f"http://127.0.0.1:{self.port}/health"
start_time = time.time()
timeout = 3600 * 8 # 8 hours
while time.time() - start_time < timeout:
while True:
# Check if process is still alive
if self.process.poll() is not None:
# Process has terminated
@@ -357,8 +361,6 @@ class LlamaServer:
pass
time.sleep(1)
else:
raise TimeoutError(f"Server health check timed out after {timeout} seconds")
# Server is now healthy, get model info
self._get_vocabulary_size()
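The startup wait is now unbounded: the loop exits only when the server answers on /health or the child process dies, rather than aborting after a fixed eight-hour timeout (useful when large models take an unpredictable time to load). A standalone sketch of the same polling pattern (URL and exception handling are illustrative):

import time
import requests

def wait_for_health(process, health_url):
    """Block until the server reports healthy or its process exits."""
    while True:
        if process.poll() is not None:  # server terminated during startup
            raise RuntimeError("Server process exited before becoming healthy.")
        try:
            if requests.get(health_url, timeout=1).status_code == 200:
                return  # healthy
        except requests.exceptions.RequestException:
            pass  # not accepting connections yet
        time.sleep(1)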

View file

@@ -143,6 +143,7 @@ def transformers_samplers():
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'skip_special_tokens',
'static_cache',
'seed',
@@ -195,6 +196,7 @@ loaders_samplers = {
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'skip_special_tokens',
'seed',
'sampler_priority',
@@ -241,6 +243,7 @@ loaders_samplers = {
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'skip_special_tokens',
'seed',
'sampler_priority',
@@ -279,6 +282,7 @@ loaders_samplers = {
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'skip_special_tokens',
'seed',
'custom_token_bans',
@@ -311,6 +315,7 @@ loaders_samplers = {
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'seed',
'sampler_priority',
'dry_sequence_breakers',

View file

@@ -45,6 +45,9 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
output = {}
for entry in logprobs:
token = repr(entry['token'])
if len(token) > 2 and token.startswith("'") and token.endswith("'"):
token = token[1:-1]
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
output[token] = prob
return output
@@ -52,6 +55,9 @@ def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False):
output = ''
for entry in logprobs:
token = repr(entry['token'])
if len(token) > 2 and token.startswith("'") and token.endswith("'"):
token = token[1:-1]
prob = entry['prob'] if use_samplers else np.exp(entry['logprob'])
output += f"{prob:.5f} - {token}\n"
return output, previous
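In both branches, repr() keeps whitespace tokens visible (a newline token shows up as \n rather than breaking the layout), and the new guard strips the quotes that repr() wraps around ordinary tokens. A small illustration of the effect:

for raw in ["hello", " world", "\n"]:
    token = repr(raw)  # "'hello'", "' world'", "'\\n'"
    if len(token) > 2 and token.startswith("'") and token.endswith("'"):
        token = token[1:-1]  # drop the surrounding quotes, keep escape sequences visible
    print(token)
# hello
#  world
# \n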

View file

@@ -47,10 +47,11 @@ settings = {
'max_new_tokens_max': 4096,
'prompt_lookup_num_tokens': 0,
'max_tokens_second': 0,
'max_updates_second': 0,
'max_updates_second': 12,
'auto_max_new_tokens': True,
'ban_eos_token': False,
'add_bos_token': True,
'enable_thinking': True,
'skip_special_tokens': True,
'stream': True,
'static_cache': False,

View file

@@ -100,8 +100,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escape_html=False):
last_update = cur_time
yield reply
yield reply
if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything):
break

View file

@@ -114,6 +114,7 @@ def list_model_elements():
'cache_type',
'tensor_split',
'extra_flags',
'streaming_llm',
'gpu_split',
'alpha_value',
'rope_freq_base',
@@ -198,6 +199,7 @@ def list_interface_input_elements():
'auto_max_new_tokens',
'ban_eos_token',
'add_bos_token',
'enable_thinking',
'skip_special_tokens',
'stream',
'static_cache',

View file

@@ -51,7 +51,7 @@ def create_ui():
shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch)
shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size)
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
shared.gradio['ctx_size'] = gr.Number(label='ctx_size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.')
shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
@@ -96,7 +96,7 @@ def create_ui():
# Speculative decoding
with gr.Accordion("Speculative decoding", open=False, elem_classes='tgw-accordion') as shared.gradio['speculative_decoding_accordion']:
with gr.Row():
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', interactive=not mu)
shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu)
ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu)
shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.')

View file

@@ -82,6 +82,7 @@ def create_ui(default_preset):
shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.')
shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.')
shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle <think> mode.')
shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming')
shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.')

View file

@@ -30,10 +30,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

View file

@@ -29,6 +29,6 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@@ -29,6 +29,6 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@@ -29,7 +29,7 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@@ -29,8 +29,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl

View file

@@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@@ -29,5 +29,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"

View file

@@ -30,10 +30,10 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a4/exllamav3-0.0.1a4+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,4 +15,4 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,4 +15,4 @@ sse-starlette==1.6.5
tiktoken
# AMD wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"

View file

@@ -15,6 +15,6 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# llama.cpp (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -15,5 +15,5 @@ sse-starlette==1.6.5
tiktoken
# CUDA wheels
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.6.0/llama_cpp_binaries-0.6.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows"
https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View file

@@ -18,10 +18,11 @@ max_new_tokens_min: 1
max_new_tokens_max: 4096
prompt_lookup_num_tokens: 0
max_tokens_second: 0
max_updates_second: 0
max_updates_second: 12
auto_max_new_tokens: true
ban_eos_token: false
add_bos_token: true
enable_thinking: true
skip_special_tokens: true
stream: true
static_cache: false