Mirror of https://github.com/oobabooga/text-generation-webui.git, synced 2025-06-09 07:07:16 -04:00
llama.cpp: Add --extra-flags parameter for passing additional flags to llama-server
parent  b6fffbd216
commit  98f4c694b9

5 changed files with 18 additions and 0 deletions
@@ -301,6 +301,20 @@ class LlamaServer:
             cmd += ["--device-draft", shared.args.device_draft]
         if shared.args.ctx_size_draft > 0:
             cmd += ["--ctx-size-draft", str(shared.args.ctx_size_draft)]
+        if shared.args.extra_flags:
+            # Clean up the input
+            extra_flags = shared.args.extra_flags.strip()
+            if extra_flags.startswith('"') and extra_flags.endswith('"'):
+                extra_flags = extra_flags[1:-1].strip()
+            elif extra_flags.startswith("'") and extra_flags.endswith("'"):
+                extra_flags = extra_flags[1:-1].strip()
+
+            for flag_item in extra_flags.split(';'):
+                if '=' in flag_item:
+                    flag, value = flag_item.split('=', 1)
+                    cmd += [f"--{flag}", value]
+                else:
+                    cmd.append(f"--{flag_item}")

         env = os.environ.copy()
         if os.name == 'posix':
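For reference, a minimal standalone sketch of the parsing logic added above; the helper name parse_extra_flags and the example flag values are illustrative, not part of the commit. A ';'-separated string, optionally wrapped in quotes, is expanded into separate llama-server arguments.

def parse_extra_flags(extra_flags: str) -> list:
    # Mirrors the clean-up and split logic in the diff above (hypothetical helper).
    args = []
    extra_flags = extra_flags.strip()
    # Strip one pair of surrounding quotes, if present
    if extra_flags.startswith('"') and extra_flags.endswith('"'):
        extra_flags = extra_flags[1:-1].strip()
    elif extra_flags.startswith("'") and extra_flags.endswith("'"):
        extra_flags = extra_flags[1:-1].strip()

    # Each ';'-separated item becomes "--flag value" (if it contains '=') or a bare "--flag"
    for flag_item in extra_flags.split(';'):
        if '=' in flag_item:
            flag, value = flag_item.split('=', 1)
            args += [f"--{flag}", value]
        else:
            args.append(f"--{flag_item}")
    return args

# Illustrative input/output:
#   parse_extra_flags('"override-tensor=exps=CPU;no-mmap"')
#   -> ['--override-tensor', 'exps=CPU', '--no-mmap']

Note that split('=', 1) only splits on the first '=', so a value that itself contains '=' (as in override-tensor=exps=CPU) is passed through to llama-server intact.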
@@ -12,6 +12,7 @@ loaders_and_params = OrderedDict({
         'n_ctx',
         'cache_type',
         'tensor_split',
+        'extra_flags',
         'rope_freq_base',
         'compress_pos_emb',
         'flash_attn',
@@ -128,6 +128,7 @@ group.add_argument('--tensor-split', type=str, default=None, help='Split the mod
 group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.')
 group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
 group.add_argument('--row-split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.')
+group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"')

 # Speculative decoding
 group = parser.add_argument_group('Speculative decoding')
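Because the new option is a plain argparse string, the whole "flag1=value1;flag2" expression arrives as a single value in shared.args.extra_flags and is only split apart later by the launch code in the first hunk. A minimal sketch of that behaviour (the standalone parser and example value are illustrative, not taken from the commit):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--extra-flags', type=str, default=None,
                    help='Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3".')

args = parser.parse_args(['--extra-flags', 'override-tensor=exps=CPU;no-mmap'])
print(args.extra_flags)  # 'override-tensor=exps=CPU;no-mmap' -- one raw string, split on ';' later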
@@ -114,6 +114,7 @@ def list_model_elements():
         'max_seq_len',
         'cache_type',
         'tensor_split',
+        'extra_flags',
         'gpu_split',
         'alpha_value',
         'rope_freq_base',
@@ -56,6 +56,7 @@ def create_ui():
         shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q6', 'q4'], value=shared.args.cache_type, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.')
         shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40')
         shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
+        shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU"')
         shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory)
         shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.')
         shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.')