Estimated VRAM to load the model:"
+        if for_ui:
+            return (vram_info, gr.update()) if auto_adjust else vram_info
+        else:
+            return (0, gpu_layers) if auto_adjust else 0
+
+    current_layers = gpu_layers
+    max_layers = gpu_layers
+
+    if auto_adjust:
+        # Get max layers from model metadata
+        model_settings = get_model_metadata(model)
+        max_layers = model_settings.get('gpu_layers', gpu_layers)
+
+        # Auto-adjust based on available VRAM
+        available_vram = get_nvidia_free_vram()
+        if available_vram > 0:
+            tolerance = 906  # MiB of free VRAM to keep as headroom
+            current_layers = max_layers
+            while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
+                current_layers -= 1
+
+    # Calculate VRAM with current layers
+    vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)
+
+    if for_ui:
+        vram_info = f"Estimated VRAM to load the model: {vram_usage:.0f} MiB"
+        if auto_adjust:
+            return vram_info, gr.update(value=current_layers, maximum=max_layers)
+        else:
+            return vram_info
+    else:
+        if auto_adjust:
+            return vram_usage, current_layers
+        else:
+            return vram_usage
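
For reference, a minimal sketch (not part of the patch) of calling the new helper directly with for_ui=False; the GGUF filename and parameter values below are made up, and the return shape follows the auto_adjust branch shown above:

from modules.models_settings import update_gpu_layers_and_vram

vram_mib, layers = update_gpu_layers_and_vram(
    'llama.cpp',            # loader
    'MyModel-Q4_K_M.gguf',  # hypothetical GGUF model name
    99,                     # gpu_layers; only a fallback when the model metadata has its own value
    8192,                   # ctx_size
    'fp16',                 # cache_type
    auto_adjust=True,
    for_ui=False            # return (vram_usage, current_layers) instead of UI strings
)
print(f"~{vram_mib:.0f} MiB estimated with {layers} GPU layers offloaded")
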
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index a1911124..b6febb50 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -11,10 +11,10 @@ from modules.LoRA import add_lora_to_model
from modules.models import load_model, unload_model
from modules.models_settings import (
    apply_model_settings_to_state,
-    estimate_vram,
    get_model_metadata,
    save_instruction_template,
    save_model_settings,
+    update_gpu_layers_and_vram,
    update_model_parameters
)
from modules.utils import gradio
@@ -45,7 +45,7 @@ def create_ui():
shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
with gr.Column():
- shared.gradio['vram_info'] = gr.HTML(value=lambda: estimate_vram_wrapper(shared.args.model, shared.args.gpu_layers, shared.args.ctx_size, shared.args.cache_type))
+ shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
@@ -150,10 +150,18 @@ def create_event_handlers():
        ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
        save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)
-    shared.gradio['model_menu'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
-    shared.gradio['gpu_layers'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
-    shared.gradio['ctx_size'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
-    shared.gradio['cache_type'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
+    # For ctx_size and cache_type - auto-adjust GPU layers
+    for param in ['ctx_size', 'cache_type']:
+        shared.gradio[param].change(
+            partial(update_gpu_layers_and_vram, auto_adjust=True),
+            gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
+            gradio('vram_info', 'gpu_layers'), show_progress=False)
+
+    # For manual gpu_layers changes - only update VRAM
+    shared.gradio['gpu_layers'].change(
+        partial(update_gpu_layers_and_vram, auto_adjust=False),
+        gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
+        gradio('vram_info'), show_progress=False)
    if not shared.args.portable:
        shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
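
The wiring above relies on functools.partial to pin auto_adjust while Gradio supplies the watched component values positionally (ui_model_menu.py is assumed to import partial). A self-contained sketch of the pattern with a toy handler standing in for update_gpu_layers_and_vram:

from functools import partial

# Toy stand-in: the positional arguments mirror the gradio(...) input list,
# while auto_adjust is fixed ahead of time by partial().
def handler(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False):
    return f"auto_adjust={auto_adjust}, gpu_layers={gpu_layers}"

auto_handler = partial(handler, auto_adjust=True)     # for ctx_size / cache_type changes
manual_handler = partial(handler, auto_adjust=False)  # for manual gpu_layers changes

print(auto_handler('llama.cpp', 'model.gguf', 33, 8192, 'fp16'))
print(manual_handler('llama.cpp', 'model.gguf', 33, 8192, 'fp16'))
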
@@ -282,14 +290,6 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
        yield traceback.format_exc().replace('\n', '\n\n')
-def estimate_vram_wrapper(model, gpu_layers, ctx_size, cache_type):
-    if model in ["None", None]:
-        return "Estimated VRAM to load the model:"
-
-    result = estimate_vram(model, gpu_layers, ctx_size, cache_type)
-    return f"Estimated VRAM to load the model: {result:.0f} MiB"
-
-
def update_truncation_length(current_length, state):
    if 'loader' in state:
        if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp':
@@ -298,10 +298,26 @@ def update_truncation_length(current_length, state):
    return current_length
+def get_initial_vram_info():
+    if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
+        return update_gpu_layers_and_vram(
+            shared.args.loader,
+            shared.model_name,
+            shared.args.gpu_layers,
+            shared.args.ctx_size,
+            shared.args.cache_type,
+            auto_adjust=False,
+            for_ui=True
+        )
+
+    return "Estimated VRAM to load the model:"
+
+
def handle_load_model_event_initial(model, state):
    state = apply_model_settings_to_state(model, state)
    output = ui.apply_interface_values(state)
-    update_model_parameters(state)
+    update_model_parameters(state)  # This updates the command-line flags
+
return output + [state]
diff --git a/server.py b/server.py
index c35d65a8..c22ed1f1 100644
--- a/server.py
+++ b/server.py
@@ -49,10 +49,9 @@ from modules.extensions import apply_extensions
from modules.LoRA import add_lora_to_model
from modules.models import load_model, unload_model_if_idle
from modules.models_settings import (
-    estimate_vram,
    get_fallback_settings,
    get_model_metadata,
-    get_nvidia_free_vram,
+    update_gpu_layers_and_vram,
    update_model_parameters
)
from modules.shared import do_cmd_flags_warnings
@@ -250,15 +249,19 @@ if __name__ == "__main__":
        model_settings = get_model_metadata(model_name)
        update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments
-        if 'gpu_layers' not in shared.provided_arguments:
-            available_vram = get_nvidia_free_vram()
-            if available_vram > 0:
-                n_layers = model_settings['gpu_layers']
-                tolerance = 906
-                while n_layers > 0 and estimate_vram(model_name, n_layers, shared.args.ctx_size, shared.args.cache_type) > available_vram - tolerance:
-                    n_layers -= 1
+        # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
+        if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
+            vram_usage, adjusted_layers = update_gpu_layers_and_vram(
+                shared.args.loader,
+                model_name,
+                model_settings['gpu_layers'],
+                shared.args.ctx_size,
+                shared.args.cache_type,
+                auto_adjust=True,
+                for_ui=False
+            )
-                shared.args.gpu_layers = n_layers
+            shared.args.gpu_layers = adjusted_layers
        # Load the model
        shared.model, shared.tokenizer = load_model(model_name)
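
For intuition, a standalone sketch (not from the repository) of the layer-reduction search that update_gpu_layers_and_vram performs at startup, with an invented linear cost function standing in for estimate_vram:

# estimate_fn stands in for estimate_vram(model, layers, ctx_size, cache_type);
# layers are dropped until the estimate fits under the free VRAM minus a headroom.
def pick_gpu_layers(max_layers, available_vram_mib, estimate_fn, tolerance_mib=906):
    layers = max_layers
    while layers > 0 and estimate_fn(layers) > available_vram_mib - tolerance_mib:
        layers -= 1
    return layers

# Invented cost model: ~55 MiB per offloaded layer plus 1500 MiB of fixed overhead.
print(pick_gpu_layers(99, 4096, lambda n: 1500 + 55 * n))  # -> 30 with 4096 MiB free
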