diff --git a/modules/models_settings.py b/modules/models_settings.py
index a8e17594..6ea6660c 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -213,24 +213,26 @@ def apply_model_settings_to_state(model, state):
     model_settings = get_model_metadata(model)
     if 'loader' in model_settings:
         loader = model_settings.pop('loader')
-
-        # If the user is using an alternative loader for the same model type, let them keep using it
         if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
             state['loader'] = loader

     for k in model_settings:
-        if k in state:
-            if k == 'gpu_layers':
-                available_vram = get_nvidia_free_vram()
-                n_layers = model_settings[k]
-                if available_vram > 0:
-                    tolerance = 906
-                    while n_layers > 0 and estimate_vram(model, n_layers, state['ctx_size'], state['cache_type']) > available_vram - tolerance:
-                        n_layers -= 1
+        if k in state and k != 'gpu_layers':  # Skip gpu_layers, handle separately
+            state[k] = model_settings[k]

-                state[k] = gr.update(value=n_layers, maximum=model_settings[k])
-            else:
-                state[k] = model_settings[k]
+    # Handle GPU layers and VRAM update for llama.cpp
+    if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
+        vram_info, gpu_layers_update = update_gpu_layers_and_vram(
+            state['loader'],
+            model,
+            model_settings['gpu_layers'],
+            state['ctx_size'],
+            state['cache_type'],
+            auto_adjust=True
+        )
+
+        state['gpu_layers'] = gpu_layers_update
+        state['vram_info'] = vram_info

     return state

@@ -426,3 +428,53 @@ def get_nvidia_free_vram():
             raise  # Handle any other unexpected exceptions

     return -1
+
+
+def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
+    """
+    Unified function to handle GPU layers and VRAM updates.
+
+    Args:
+        for_ui: If True, returns Gradio updates. If False, returns raw values.
+
+    Returns:
+        - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
+        - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
+    """
+    if loader != 'llama.cpp' or model in ["None", None]:
+        vram_info = "Estimated VRAM to load the model:"
+        if for_ui:
+            return (vram_info, gr.update()) if auto_adjust else vram_info
+        else:
+            return (0, gpu_layers) if auto_adjust else 0
+
+    current_layers = gpu_layers
+    max_layers = gpu_layers
+
+    if auto_adjust:
+        # Get max layers from model metadata
+        model_settings = get_model_metadata(model)
+        max_layers = model_settings.get('gpu_layers', gpu_layers)
+
+        # Auto-adjust based on available VRAM
+        available_vram = get_nvidia_free_vram()
+        if available_vram > 0:
+            tolerance = 906
+            current_layers = max_layers
+            while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
+                current_layers -= 1
+
+    # Calculate VRAM with current layers
+    vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)
+
+    if for_ui:
+        vram_info = f"Estimated VRAM to load the model: {vram_usage:.0f} MiB"
+        if auto_adjust:
+            return vram_info, gr.update(value=current_layers, maximum=max_layers)
+        else:
+            return vram_info
+    else:
+        if auto_adjust:
+            return vram_usage, current_layers
+        else:
+            return vram_usage
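Reviewer note: the new helper has two calling modes, and the shape of the return value depends on `for_ui` and `auto_adjust`. A short usage sketch, assuming it runs inside a text-generation-webui checkout with a GGUF model already downloaded; the model name, layer count, context size, and cache type below are made-up placeholders.

```python
from modules.models_settings import update_gpu_layers_and_vram

# UI mode with auto-adjust: an HTML-ready string plus a gr.update() for the
# gpu_layers slider (value clamped to what fits in free VRAM, maximum taken
# from the model metadata).
vram_info, gpu_layers_update = update_gpu_layers_and_vram(
    'llama.cpp', 'MyModel-GGUF', 33, 8192, 'fp16',
    auto_adjust=True, for_ui=True
)

# Headless mode (what server.py uses at startup): raw numbers instead of
# Gradio updates.
vram_usage, adjusted_layers = update_gpu_layers_and_vram(
    'llama.cpp', 'MyModel-GGUF', 33, 8192, 'fp16',
    auto_adjust=True, for_ui=False
)

# Without auto_adjust, a single value comes back: the VRAM string
# (for_ui=True) or the estimated usage in MiB (for_ui=False).
vram_info_only = update_gpu_layers_and_vram(
    'llama.cpp', 'MyModel-GGUF', 33, 8192, 'fp16',
    auto_adjust=False, for_ui=True
)
```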
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index a1911124..b6febb50 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -11,10 +11,10 @@ from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model
 from modules.models_settings import (
     apply_model_settings_to_state,
-    estimate_vram,
     get_model_metadata,
     save_instruction_template,
     save_model_settings,
+    update_gpu_layers_and_vram,
     update_model_parameters
 )
 from modules.utils import gradio
@@ -45,7 +45,7 @@ def create_ui():
                             shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)

                         with gr.Column():
-                            shared.gradio['vram_info'] = gr.HTML(value=lambda: estimate_vram_wrapper(shared.args.model, shared.args.gpu_layers, shared.args.ctx_size, shared.args.cache_type))
+                            shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
                             shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
                             shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                             shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
@@ -150,10 +150,18 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)

-    shared.gradio['model_menu'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
-    shared.gradio['gpu_layers'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
-    shared.gradio['ctx_size'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
-    shared.gradio['cache_type'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
+    # For ctx_size and cache_type - auto-adjust GPU layers
+    for param in ['ctx_size', 'cache_type']:
+        shared.gradio[param].change(
+            partial(update_gpu_layers_and_vram, auto_adjust=True),
+            gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
+            gradio('vram_info', 'gpu_layers'), show_progress=False)
+
+    # For manual gpu_layers changes - only update VRAM
+    shared.gradio['gpu_layers'].change(
+        partial(update_gpu_layers_and_vram, auto_adjust=False),
+        gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
+        gradio('vram_info'), show_progress=False)

     if not shared.args.portable:
         shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
@@ -282,14 +290,6 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
         yield traceback.format_exc().replace('\n', '\n\n')


-def estimate_vram_wrapper(model, gpu_layers, ctx_size, cache_type):
-    if model in ["None", None]:
-        return "Estimated VRAM to load the model:"
-
-    result = estimate_vram(model, gpu_layers, ctx_size, cache_type)
-    return f"Estimated VRAM to load the model: {result:.0f} MiB"
-
-
 def update_truncation_length(current_length, state):
     if 'loader' in state:
         if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp':
@@ -298,10 +298,26 @@ def update_truncation_length(current_length, state):
     return current_length


+def get_initial_vram_info():
+    if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
+        return update_gpu_layers_and_vram(
+            shared.args.loader,
+            shared.model_name,
+            shared.args.gpu_layers,
+            shared.args.ctx_size,
+            shared.args.cache_type,
+            auto_adjust=False,
+            for_ui=True
+        )
+
+    return "Estimated VRAM to load the model:"
+
+
 def handle_load_model_event_initial(model, state):
     state = apply_model_settings_to_state(model, state)
     output = ui.apply_interface_values(state)
-    update_model_parameters(state)
+    update_model_parameters(state)  # This updates the command-line flags
+
     return output + [state]
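A note on the event wiring above: `partial` pins `auto_adjust` as a keyword argument, while Gradio fills the positional parameters from the listed input components, in order. A self-contained toy of the same pattern (the `update` function below is a hypothetical stand-in, not the real handler):

```python
from functools import partial


def update(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False):
    # Hypothetical stand-in for update_gpu_layers_and_vram: just echo the arguments.
    return (loader, model, gpu_layers, ctx_size, cache_type, auto_adjust)


# Equivalent of shared.gradio['ctx_size'].change(partial(..., auto_adjust=True), ...):
# Gradio passes the current values of ('loader', 'model_menu', 'gpu_layers',
# 'ctx_size', 'cache_type') positionally, and the keyword stays bound.
handler = partial(update, auto_adjust=True)
print(handler('llama.cpp', 'MyModel-GGUF', 33, 8192, 'fp16'))
# ('llama.cpp', 'MyModel-GGUF', 33, 8192, 'fp16', True)
```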
diff --git a/server.py b/server.py
index c35d65a8..c22ed1f1 100644
--- a/server.py
+++ b/server.py
@@ -49,10 +49,9 @@ from modules.extensions import apply_extensions
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
-    estimate_vram,
     get_fallback_settings,
     get_model_metadata,
-    get_nvidia_free_vram,
+    update_gpu_layers_and_vram,
     update_model_parameters
 )
 from modules.shared import do_cmd_flags_warnings
@@ -250,15 +249,19 @@ if __name__ == "__main__":
         model_settings = get_model_metadata(model_name)
         update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments

-        if 'gpu_layers' not in shared.provided_arguments:
-            available_vram = get_nvidia_free_vram()
-            if available_vram > 0:
-                n_layers = model_settings['gpu_layers']
-                tolerance = 906
-                while n_layers > 0 and estimate_vram(model_name, n_layers, shared.args.ctx_size, shared.args.cache_type) > available_vram - tolerance:
-                    n_layers -= 1
+        # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
+        if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
+            vram_usage, adjusted_layers = update_gpu_layers_and_vram(
+                shared.args.loader,
+                model_name,
+                model_settings['gpu_layers'],
+                shared.args.ctx_size,
+                shared.args.cache_type,
+                auto_adjust=True,
+                for_ui=False
+            )

-                shared.args.gpu_layers = n_layers
+            shared.args.gpu_layers = adjusted_layers

         # Load the model
         shared.model, shared.tokenizer = load_model(model_name)
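To make the headless startup path concrete: with `auto_adjust=True, for_ui=False`, the helper walks `gpu_layers` down from the model's maximum until the estimate fits under free VRAM minus the 906 MiB tolerance. A self-contained sketch of that loop with a made-up linear cost model in place of `estimate_vram` (the real numbers come from the GGUF metadata and nvidia-smi):

```python
def fake_estimate_vram(n_layers, ctx_size):
    # Made-up linear cost model standing in for estimate_vram(); values in MiB.
    return 400 + 150 * n_layers + 0.05 * ctx_size


def auto_adjust_layers(max_layers, ctx_size, available_vram, tolerance=906):
    # Same strategy as the patch: start at the model's maximum and decrement
    # until the estimate fits under (free VRAM - tolerance), or reach 0.
    n_layers = max_layers
    while n_layers > 0 and fake_estimate_vram(n_layers, ctx_size) > available_vram - tolerance:
        n_layers -= 1
    return n_layers


# With 6144 MiB free, 33 max layers, and an 8192-token context, this toy model
# settles on 29 layers.
print(auto_adjust_layers(max_layers=33, ctx_size=8192, available_vram=6144))
```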