From 3fa1a899aea3ff2700a20a8bc2da17202d3065e5 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 13 May 2025 12:07:59 -0700
Subject: [PATCH] UI: Fix gpu-layers being ignored (closes #6973)

---
 modules/loaders.py         | 2 +-
 modules/models_settings.py | 2 +-
 modules/ui.py              | 2 +-
 modules/ui_model_menu.py   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/loaders.py b/modules/loaders.py
index 4b76549b..583b65c2 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -5,7 +5,7 @@ import gradio as gr
 
 loaders_and_params = OrderedDict({
     'llama.cpp': [
-        'n_gpu_layers',
+        'gpu_layers',
         'threads',
         'threads_batch',
         'batch_size',
diff --git a/modules/models_settings.py b/modules/models_settings.py
index ae589bb3..4418e3fb 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -67,7 +67,7 @@ def get_model_metadata(model):
             elif k.endswith('rope.scaling.factor'):
                 model_settings['compress_pos_emb'] = metadata[k]
             elif k.endswith('block_count'):
-                model_settings['n_gpu_layers'] = metadata[k] + 1
+                model_settings['gpu_layers'] = metadata[k] + 1
 
         if 'tokenizer.chat_template' in metadata:
             template = metadata['tokenizer.chat_template']
diff --git a/modules/ui.py b/modules/ui.py
index b3d4bccf..eeb6ce92 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -105,7 +105,7 @@ def list_model_elements():
         'filter_by_loader',
         'loader',
         'cpu_memory',
-        'n_gpu_layers',
+        'gpu_layers',
         'threads',
         'threads_batch',
         'batch_size',
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 1e27255b..b63a127c 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -50,7 +50,7 @@ def create_ui():
                gr.Markdown("## Main options")
                with gr.Row():
                    with gr.Column():
-                        shared.gradio['n_gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
+                        shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=256, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.')
                        shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model.')
                        shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
                        shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
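
For reviewers: the Gradio element key, the entries in loaders_and_params and list_model_elements(), and the shared.args attribute read by the llama.cpp loader all need to use the same string. The command-line option is --gpu-layers (shared.args.gpu_layers, as the slider's value= already shows), so a UI element still registered as 'n_gpu_layers' no longer mapped onto any argument and the slider setting was silently dropped. Below is a minimal sketch of that failure mode; apply_ui_state and the variable names are hypothetical stand-ins, not the project's actual helpers.

# Hypothetical sketch: UI values are applied to the argument namespace by key,
# so a key that no longer matches any attribute is silently ignored.
from argparse import Namespace

args = Namespace(gpu_layers=0)      # stand-in for shared.args; 0 = CPU only
ui_state = {'n_gpu_layers': 33}     # value produced by the old element key

def apply_ui_state(args, ui_state):
    """Copy UI values onto the argument namespace, matching by attribute name."""
    for key, value in ui_state.items():
        if hasattr(args, key):
            setattr(args, key, value)   # only keys that exist on args take effect

apply_ui_state(args, ui_state)
print(args.gpu_layers)              # 0 -> the slider setting never reached the loader

apply_ui_state(args, {'gpu_layers': 33})    # renamed key, as in this patch
print(args.gpu_layers)              # 33 -> the value is now picked up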