Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2025-06-08 06:35:57 -04:00)

Fix GPU layers slider: honor saved settings and show true maximum

Parent:  e4d3f4449d
Commit:  1c549d176b

2 changed files with 22 additions and 10 deletions
@@ -72,6 +72,7 @@ def get_model_metadata(model):
                 model_settings['compress_pos_emb'] = metadata[k]
             elif k.endswith('block_count'):
                 model_settings['gpu_layers'] = metadata[k] + 1
+                model_settings['max_gpu_layers'] = metadata[k] + 1

         if 'tokenizer.chat_template' in metadata:
             template = metadata['tokenizer.chat_template']
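With this change, get_model_metadata() records the model's true layer count under a separate max_gpu_layers key, so later adjustments to gpu_layers cannot hide the real slider maximum. A minimal sketch of the new mapping, using a hypothetical 32-block GGUF metadata dict (illustration only, not the project's loader code):

# Hypothetical GGUF metadata; only the block_count key matters here.
metadata = {'llama.block_count': 32}

model_settings = {}
for k in metadata:
    if k.endswith('block_count'):
        # block_count transformer layers plus one extra, as in the patch (metadata[k] + 1)
        model_settings['gpu_layers'] = metadata[k] + 1
        model_settings['max_gpu_layers'] = metadata[k] + 1

print(model_settings)  # {'gpu_layers': 33, 'max_gpu_layers': 33}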
@@ -450,15 +451,26 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type,
     max_layers = gpu_layers

     if auto_adjust:
-        # Get max layers from model metadata
+        # Get model settings including user preferences
         model_settings = get_model_metadata(model)
-        max_layers = model_settings.get('gpu_layers', gpu_layers)

-        # Auto-adjust based on available VRAM
-        available_vram = get_nvidia_free_vram()
-        if available_vram > 0:
-            tolerance = 906
-            current_layers = max_layers
-            while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
-                current_layers -= 1
+        # Check if the value is from user config-user.yaml
+        user_config = shared.user_config
+        model_regex = Path(model).name + '$'
+        has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex]
+
+        if has_user_setting:
+            # Just return the current user value without adjustment
+            max_layers = model_settings.get('max_gpu_layers', 256)
+        else:
+            # No user setting, use model's max and auto-adjust
+            max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers))
+            current_layers = max_layers  # Start from max
+
+            # Auto-adjust based on available VRAM
+            available_vram = get_nvidia_free_vram()
+            if available_vram > 0:
+                tolerance = 906
+                while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
+                    current_layers -= 1
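The net effect of the hunk above: a gpu_layers value saved in config-user.yaml is returned untouched (only the slider maximum is taken from metadata), while a model without a saved value starts at its true maximum and is walked down until the estimated VRAM fits. A runnable sketch of that flow under simplified assumptions; resolve_gpu_layers and the flat per-layer VRAM estimate are illustrative stand-ins, not the project's API:

from pathlib import Path

def estimate_vram_mib(n_layers: int, per_layer_mib: float = 450.0) -> float:
    # Crude stand-in for estimate_vram(); the real estimate also depends on
    # ctx_size, cache type and the specific model.
    return n_layers * per_layer_mib

def resolve_gpu_layers(model: str, saved_layers: int, user_config: dict,
                       model_settings: dict, free_vram_mib: float) -> tuple[int, int]:
    # Same regex-style key the patch builds: "<file name>$"
    model_regex = Path(model).name + '$'
    has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex]

    if has_user_setting:
        # Honor the saved value; only the slider maximum comes from metadata.
        return saved_layers, model_settings.get('max_gpu_layers', 256)

    # No saved value: start from the model's true maximum and shrink until it fits.
    max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', saved_layers))
    current, tolerance = max_layers, 906  # 906 MiB safety margin, as in the patch
    while current > 0 and estimate_vram_mib(current) > free_vram_mib - tolerance:
        current -= 1
    return current, max_layers

# No saved setting, ~8 GiB free: the 33-layer maximum is trimmed to what fits.
print(resolve_gpu_layers('model.gguf', 33, {}, {'max_gpu_layers': 33}, 8192.0))   # (16, 33)
# Saved setting present: the value 20 is returned unchanged.
print(resolve_gpu_layers('model.gguf', 20, {'model.gguf$': {'gpu_layers': 20}},
                         {'max_gpu_layers': 33}, 8192.0))                          # (20, 33)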
@@ -318,7 +318,7 @@ def get_initial_vram_info():
 def get_initial_gpu_layers_max():
     if shared.model_name != 'None' and shared.args.loader == 'llama.cpp':
         model_settings = get_model_metadata(shared.model_name)
-        return model_settings.get('gpu_layers', 256)
+        return model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', 256))

     return 256
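The nested .get() fallback in the hunk above keeps older settings working: if max_gpu_layers is absent, the slider maximum falls back to gpu_layers, and finally to 256. A small illustration with hypothetical settings dicts:

# gpu_layers may have been lowered (e.g. by a saved user value or auto-adjust),
# while max_gpu_layers keeps the model's true maximum.
new_style = {'gpu_layers': 20, 'max_gpu_layers': 33}
old_style = {'gpu_layers': 33}   # no max_gpu_layers key
empty = {}                       # no metadata at all

for settings in (new_style, old_style, empty):
    print(settings.get('max_gpu_layers', settings.get('gpu_layers', 256)))
# -> 33, 33, 256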