Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2025-06-07 06:06:20 -04:00)

Commit 4925c307cf (parent 93e1850a2c)
Auto-adjust GPU layers on context size and cache type changes + many fixes

3 changed files with 109 additions and 38 deletions
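The core of the change is a fitting rule that recurs in the hunks below: starting from the model's full layer count, decrement gpu_layers until the estimated VRAM fits into the currently free VRAM minus a fixed safety margin (tolerance = 906 MiB in the diff). A minimal sketch of that rule, assuming estimate_vram from modules.models_settings (which the diff imports) is available:

    from modules.models_settings import estimate_vram  # project helper used throughout the diff

    def fit_gpu_layers(model, max_layers, ctx_size, cache_type, available_vram, tolerance=906):
        """Largest layer count whose VRAM estimate (MiB) fits within available_vram - tolerance."""
        n_layers = max_layers
        while n_layers > 0 and estimate_vram(model, n_layers, ctx_size, cache_type) > available_vram - tolerance:
            n_layers -= 1
        return n_layers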
@@ -213,25 +213,27 @@ def apply_model_settings_to_state(model, state):
     model_settings = get_model_metadata(model)
     if 'loader' in model_settings:
         loader = model_settings.pop('loader')

         # If the user is using an alternative loader for the same model type, let them keep using it
         if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
             state['loader'] = loader

     for k in model_settings:
-        if k in state:
-            if k == 'gpu_layers':
-                available_vram = get_nvidia_free_vram()
-                n_layers = model_settings[k]
-                if available_vram > 0:
-                    tolerance = 906
-                    while n_layers > 0 and estimate_vram(model, n_layers, state['ctx_size'], state['cache_type']) > available_vram - tolerance:
-                        n_layers -= 1
-
-                state[k] = gr.update(value=n_layers, maximum=model_settings[k])
-            else:
-                state[k] = model_settings[k]
+        if k in state and k != 'gpu_layers':  # Skip gpu_layers, handle separately
+            state[k] = model_settings[k]
+
+    # Handle GPU layers and VRAM update for llama.cpp
+    if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
+        vram_info, gpu_layers_update = update_gpu_layers_and_vram(
+            state['loader'],
+            model,
+            model_settings['gpu_layers'],
+            state['ctx_size'],
+            state['cache_type'],
+            auto_adjust=True
+        )
+
+        state['gpu_layers'] = gpu_layers_update
+        state['vram_info'] = vram_info

     return state
@@ -426,3 +428,53 @@ def get_nvidia_free_vram():
             raise
     # Handle any other unexpected exceptions
     return -1
+
+
+def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
+    """
+    Unified function to handle GPU layers and VRAM updates.
+
+    Args:
+        for_ui: If True, returns Gradio updates. If False, returns raw values.
+
+    Returns:
+        - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
+        - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
+    """
+    if loader != 'llama.cpp' or model in ["None", None]:
+        vram_info = "<div id=\"vram-info\">Estimated VRAM to load the model:</div>"
+        if for_ui:
+            return (vram_info, gr.update()) if auto_adjust else vram_info
+        else:
+            return (0, gpu_layers) if auto_adjust else 0
+
+    current_layers = gpu_layers
+    max_layers = gpu_layers
+
+    if auto_adjust:
+        # Get max layers from model metadata
+        model_settings = get_model_metadata(model)
+        max_layers = model_settings.get('gpu_layers', gpu_layers)
+
+        # Auto-adjust based on available VRAM
+        available_vram = get_nvidia_free_vram()
+        if available_vram > 0:
+            tolerance = 906
+            current_layers = max_layers
+            while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
+                current_layers -= 1
+
+    # Calculate VRAM with current layers
+    vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)
+
+    if for_ui:
+        vram_info = f"<div id=\"vram-info\">Estimated VRAM to load the model: <span class=\"value\">{vram_usage:.0f} MiB</span></div>"
+        if auto_adjust:
+            return vram_info, gr.update(value=current_layers, maximum=max_layers)
+        else:
+            return vram_info
+    else:
+        if auto_adjust:
+            return vram_usage, current_layers
+        else:
+            return vram_usage
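For reference, a usage sketch of the two return modes described in the docstring above (the GGUF file name is a placeholder):

    # UI mode: an HTML string for the vram_info box plus a gr.update() for the gpu-layers slider
    vram_html, layers_update = update_gpu_layers_and_vram(
        'llama.cpp', 'my-model.gguf', 99, 32768, 'fp16',
        auto_adjust=True, for_ui=True
    )

    # Raw mode: plain numbers, as used by server.py at startup
    vram_mib, n_layers = update_gpu_layers_and_vram(
        'llama.cpp', 'my-model.gguf', 99, 32768, 'fp16',
        auto_adjust=True, for_ui=False
    )
    print(f"{n_layers} layers fit in about {vram_mib:.0f} MiB")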
@@ -11,10 +11,10 @@ from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model
 from modules.models_settings import (
     apply_model_settings_to_state,
-    estimate_vram,
     get_model_metadata,
     save_instruction_template,
     save_model_settings,
+    update_gpu_layers_and_vram,
     update_model_parameters
 )
 from modules.utils import gradio
@@ -45,7 +45,7 @@ def create_ui():
                     shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)

                 with gr.Column():
-                    shared.gradio['vram_info'] = gr.HTML(value=lambda: estimate_vram_wrapper(shared.args.model, shared.args.gpu_layers, shared.args.ctx_size, shared.args.cache_type))
+                    shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
                     shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')
                     shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
                     shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
@@ -150,10 +150,18 @@ def create_event_handlers():
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False)

-    shared.gradio['model_menu'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
-    shared.gradio['gpu_layers'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
-    shared.gradio['ctx_size'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
-    shared.gradio['cache_type'].change(estimate_vram_wrapper, gradio('model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info'), show_progress=False)
+    # For ctx_size and cache_type - auto-adjust GPU layers
+    for param in ['ctx_size', 'cache_type']:
+        shared.gradio[param].change(
+            partial(update_gpu_layers_and_vram, auto_adjust=True),
+            gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
+            gradio('vram_info', 'gpu_layers'), show_progress=False)
+
+    # For manual gpu_layers changes - only update VRAM
+    shared.gradio['gpu_layers'].change(
+        partial(update_gpu_layers_and_vram, auto_adjust=False),
+        gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'),
+        gradio('vram_info'), show_progress=False)

     if not shared.args.portable:
         shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False)
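The partial(...) wrappers above only pre-bind auto_adjust; Gradio still passes the five gradio(...) inputs positionally when the component changes. A self-contained illustration of that binding (stub function and values are hypothetical):

    from functools import partial

    def stub(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
        return auto_adjust

    handler = partial(stub, auto_adjust=True)
    # On a ctx_size or cache_type change, Gradio effectively calls:
    print(handler('llama.cpp', 'my-model.gguf', 33, 8192, 'fp16'))  # -> True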
@@ -282,14 +290,6 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur
         yield traceback.format_exc().replace('\n', '\n\n')


-def estimate_vram_wrapper(model, gpu_layers, ctx_size, cache_type):
-    if model in ["None", None]:
-        return "<div id=\"vram-info\">Estimated VRAM to load the model:</div>"
-
-    result = estimate_vram(model, gpu_layers, ctx_size, cache_type)
-    return f"<div id=\"vram-info\">Estimated VRAM to load the model: <span class=\"value\">{result:.0f} MiB</span></div>"
-
-
 def update_truncation_length(current_length, state):
     if 'loader' in state:
         if state['loader'].lower().startswith('exllama') or state['loader'] == 'llama.cpp':
@@ -298,10 +298,26 @@ def update_truncation_length(current_length, state):
     return current_length


+def get_initial_vram_info():
+    if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
+        return update_gpu_layers_and_vram(
+            shared.args.loader,
+            shared.model_name,
+            shared.args.gpu_layers,
+            shared.args.ctx_size,
+            shared.args.cache_type,
+            auto_adjust=False,
+            for_ui=True
+        )
+
+    return "<div id=\"vram-info\">Estimated VRAM to load the model:</div>"
+
+
 def handle_load_model_event_initial(model, state):
     state = apply_model_settings_to_state(model, state)
     output = ui.apply_interface_values(state)
-    update_model_parameters(state)
+    update_model_parameters(state)  # This updates the command-line flags

     return output + [state]
server.py
@@ -49,10 +49,9 @@ from modules.extensions import apply_extensions
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
-    estimate_vram,
     get_fallback_settings,
     get_model_metadata,
-    get_nvidia_free_vram,
+    update_gpu_layers_and_vram,
     update_model_parameters
 )
 from modules.shared import do_cmd_flags_warnings
@@ -250,15 +249,19 @@ if __name__ == "__main__":
         model_settings = get_model_metadata(model_name)
         update_model_parameters(model_settings, initial=True)  # hijack the command-line arguments

-        if 'gpu_layers' not in shared.provided_arguments:
-            available_vram = get_nvidia_free_vram()
-            if available_vram > 0:
-                n_layers = model_settings['gpu_layers']
-                tolerance = 906
-                while n_layers > 0 and estimate_vram(model_name, n_layers, shared.args.ctx_size, shared.args.cache_type) > available_vram - tolerance:
-                    n_layers -= 1
-
-                shared.args.gpu_layers = n_layers
+        # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model
+        if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings:
+            vram_usage, adjusted_layers = update_gpu_layers_and_vram(
+                shared.args.loader,
+                model_name,
+                model_settings['gpu_layers'],
+                shared.args.ctx_size,
+                shared.args.cache_type,
+                auto_adjust=True,
+                for_ui=False
+            )
+
+            shared.args.gpu_layers = adjusted_layers

         # Load the model
         shared.model, shared.tokenizer = load_model(model_name)
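Because the guard above checks shared.provided_arguments, an explicit --gpu-layers on the command line always wins over the auto-adjustment. A compressed, hypothetical sketch of that decision (shared.provided_arguments is taken to be the set of flag names the user passed, as the membership test implies):

    def pick_gpu_layers(provided_arguments, loader, model_settings, user_value, auto_value):
        # Keep the user's explicit value; otherwise auto-fit for llama.cpp models.
        if 'gpu_layers' not in provided_arguments and loader == 'llama.cpp' and 'gpu_layers' in model_settings:
            return auto_value
        return user_value

    print(pick_gpu_layers(set(), 'llama.cpp', {'gpu_layers': 99}, 99, 33))            # auto-adjusted -> 33
    print(pick_gpu_layers({'gpu_layers'}, 'llama.cpp', {'gpu_layers': 99}, 20, 33))   # user value kept -> 20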