# text-generation-webui-mirror/modules/models_settings.py

import functools
import json
import re
import subprocess
from math import floor
from pathlib import Path

import gradio as gr
import yaml

from modules import chat, loaders, metadata_gguf, shared, ui


def get_fallback_settings():
    return {
        'bf16': False,
        'use_eager_attention': False,
        'ctx_size': 2048,
        'rope_freq_base': 0,
        'compress_pos_emb': 1,
        'alpha_value': 1,
        'truncation_length': shared.settings['truncation_length'],
        'truncation_length_info': shared.settings['truncation_length'],
        'skip_special_tokens': shared.settings['skip_special_tokens'],
        'custom_stopping_strings': shared.settings['custom_stopping_strings'],
    }


def get_model_metadata(model):
    model_settings = {}

    # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
    settings = shared.model_config
    for pat in settings:
        if re.match(pat.lower(), Path(model).name.lower()):
            for k in settings[pat]:
                model_settings[k] = settings[pat][k]

    path = Path(f'{shared.args.model_dir}/{model}/config.json')
    if path.exists():
        hf_metadata = json.loads(open(path, 'r', encoding='utf-8').read())
    else:
        hf_metadata = None

    if 'loader' not in model_settings:
        quant_method = None if hf_metadata is None else hf_metadata.get("quantization_config", {}).get("quant_method", None)
        model_settings['loader'] = infer_loader(
            model,
            model_settings,
            hf_quant_method=quant_method
        )

    # GGUF metadata
    if model_settings['loader'] == 'llama.cpp':
        path = Path(f'{shared.args.model_dir}/{model}')
        if path.is_file():
            model_file = path
        else:
            model_file = list(path.glob('*.gguf'))[0]

        metadata = load_gguf_metadata_with_cache(model_file)
        for k in metadata:
            if k.endswith('context_length'):
                model_settings['ctx_size'] = min(metadata[k], 8192)
                model_settings['truncation_length_info'] = metadata[k]
            elif k.endswith('rope.freq_base'):
                model_settings['rope_freq_base'] = metadata[k]
            elif k.endswith('rope.scale_linear'):
                model_settings['compress_pos_emb'] = metadata[k]
            elif k.endswith('rope.scaling.factor'):
                model_settings['compress_pos_emb'] = metadata[k]
            elif k.endswith('block_count'):
                model_settings['gpu_layers'] = metadata[k] + 1
                model_settings['max_gpu_layers'] = metadata[k] + 1

        if 'tokenizer.chat_template' in metadata:
            template = metadata['tokenizer.chat_template']
            eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']]
            if 'tokenizer.ggml.bos_token_id' in metadata:
                bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']]
            else:
                bos_token = ""

            template = template.replace('eos_token', "'{}'".format(eos_token))
            template = template.replace('bos_token', "'{}'".format(bos_token))
            template = re.sub(r'raise_exception\([^)]*\)', "''", template)
            template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
            model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
            model_settings['instruction_template_str'] = template
    else:
        # Transformers metadata
        if hf_metadata is not None:
            metadata = json.loads(open(path, 'r', encoding='utf-8').read())
            if 'pretrained_config' in metadata:
                metadata = metadata['pretrained_config']

            for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']:
                if k in metadata:
                    model_settings['truncation_length'] = metadata[k]
                    model_settings['truncation_length_info'] = metadata[k]
                    model_settings['ctx_size'] = min(metadata[k], 8192)

            if 'rope_theta' in metadata:
                model_settings['rope_freq_base'] = metadata['rope_theta']
            elif 'attn_config' in metadata and 'rope_theta' in metadata['attn_config']:
                model_settings['rope_freq_base'] = metadata['attn_config']['rope_theta']

            if 'rope_scaling' in metadata and isinstance(metadata['rope_scaling'], dict) and all(key in metadata['rope_scaling'] for key in ('type', 'factor')):
                if metadata['rope_scaling']['type'] == 'linear':
                    model_settings['compress_pos_emb'] = metadata['rope_scaling']['factor']

            # For Gemma-2
            if 'torch_dtype' in metadata and metadata['torch_dtype'] == 'bfloat16':
                model_settings['bf16'] = True

            # For Gemma-2
            if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']:
                model_settings['use_eager_attention'] = True

        # Try to find the Jinja instruct template
        path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
        if path.exists():
            metadata = json.loads(open(path, 'r', encoding='utf-8').read())
            if 'chat_template' in metadata:
                template = metadata['chat_template']
                if isinstance(template, list):
                    template = template[0]['template']

                for k in ['eos_token', 'bos_token']:
                    if k in metadata:
                        value = metadata[k]
                        if isinstance(value, dict):
                            value = value['content']

                        template = template.replace(k, "'{}'".format(value))

                template = re.sub(r'raise_exception\([^)]*\)', "''", template)
                template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
                model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
                model_settings['instruction_template_str'] = template

    if 'instruction_template' not in model_settings:
        model_settings['instruction_template'] = 'Alpaca'

    # Ignore rope_freq_base if set to the default value
    if 'rope_freq_base' in model_settings and model_settings['rope_freq_base'] == 10000:
        model_settings.pop('rope_freq_base')

    # Apply user settings from user_data/models/config-user.yaml
    settings = shared.user_config
    for pat in settings:
        if re.match(pat.lower(), Path(model).name.lower()):
            for k in settings[pat]:
                new_k = k
                if k == 'n_gpu_layers':
                    new_k = 'gpu_layers'

                model_settings[new_k] = settings[pat][k]

    # Load instruction template if defined by name rather than by value
    if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
        model_settings['instruction_template_str'] = chat.load_instruction_template(model_settings['instruction_template'])

    return model_settings
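# Editor's illustrative sketch (not part of the original module): for a hypothetical
# GGUF model named 'llama-3-8b-instruct-Q4_K_M.gguf', the dict returned above might
# look roughly like this, depending on which metadata fields the file contains:
#
#     {
#         'loader': 'llama.cpp',
#         'ctx_size': 8192,                   # capped at 8192 even if the model supports more
#         'truncation_length_info': 131072,   # the uncapped context length from the metadata
#         'gpu_layers': 33,                   # block_count + 1
#         'max_gpu_layers': 33,
#         'instruction_template': 'Custom (obtained from model metadata)',
#         'instruction_template_str': '...',  # sanitized Jinja template
#     }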
def infer_loader(model_name, model_settings, hf_quant_method=None):
    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
    if not path_to_model.exists():
        loader = None
    elif len(list(path_to_model.glob('*.gguf'))) > 0:
        loader = 'llama.cpp'
    elif re.match(r'.*\.gguf', model_name.lower()):
        loader = 'llama.cpp'
    elif hf_quant_method == 'exl3':
        loader = 'ExLlamav3_HF'
    elif hf_quant_method in ['exl2', 'gptq']:
        loader = 'ExLlamav2_HF'
    elif re.match(r'.*exl3', model_name.lower()):
        loader = 'ExLlamav3_HF'
    elif re.match(r'.*exl2', model_name.lower()):
        loader = 'ExLlamav2_HF'
    else:
        loader = 'Transformers'

    return loader
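# Editor's note on the precedence above: a directory containing *.gguf files or a
# ".gguf" filename wins first, then the HF "quant_method" field (exl3, exl2/gptq),
# then "exl3"/"exl2" substrings in the model name, and finally the plain
# Transformers fallback. For example, a hypothetical folder named
# "MyModel-exl2-4.0bpw" with no quantization_config would still be routed to
# ExLlamav2_HF by the name pattern.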
def update_model_parameters(state, initial=False):
    '''
    UI: update the command-line arguments based on the interface values
    '''
    elements = ui.list_model_elements()  # the names of the parameters

    for i, element in enumerate(elements):
        if element not in state:
            continue

        value = state[element]
        if initial and element in shared.provided_arguments:
            continue

        if element == 'cpu_memory' and value == 0:
            value = vars(shared.args_defaults)[element]

        setattr(shared.args, element, value)


def apply_model_settings_to_state(model, state):
    '''
    UI: update the state variable with the model settings
    '''
    model_settings = get_model_metadata(model)
    if 'loader' in model_settings:
        loader = model_settings.pop('loader')
        if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
            state['loader'] = loader

    for k in model_settings:
        if k in state and k != 'gpu_layers':  # Skip gpu_layers, handle separately
            state[k] = model_settings[k]

    # Handle GPU layers and VRAM update for llama.cpp
    if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
        vram_info, gpu_layers_update = update_gpu_layers_and_vram(
            state['loader'],
            model,
            model_settings['gpu_layers'],
            state['ctx_size'],
            state['cache_type'],
            auto_adjust=True
        )

        state['gpu_layers'] = gpu_layers_update
        state['vram_info'] = vram_info

    return state


def save_model_settings(model, state):
    '''
    Save the settings for this model to user_data/models/config-user.yaml
    '''
    if model == 'None':
        yield ("Not saving the settings because no model is selected in the menu.")
        return

    user_config = shared.load_user_config()
    model_regex = Path(model).name + '$'  # For exact matches
    if model_regex not in user_config:
        user_config[model_regex] = {}

    for k in ui.list_model_elements():
        if k == 'loader' or k in loaders.loaders_and_params[state['loader']]:
            user_config[model_regex][k] = state[k]

    shared.user_config = user_config

    output = yaml.dump(user_config, sort_keys=False)
    p = Path(f'{shared.args.model_dir}/config-user.yaml')
    with open(p, 'w') as f:
        f.write(output)

    yield (f"Settings for `{model}` saved to `{p}`.")
def save_instruction_template(model, template):
    '''
    Similar to the function above, but it saves only the instruction template.
    '''
    if model == 'None':
        yield ("Not saving the template because no model is selected in the menu.")
        return

    user_config = shared.load_user_config()
    model_regex = Path(model).name + '$'  # For exact matches
    if model_regex not in user_config:
        user_config[model_regex] = {}

    if template == 'None':
        user_config[model_regex].pop('instruction_template', None)
    else:
        user_config[model_regex]['instruction_template'] = template

    shared.user_config = user_config

    output = yaml.dump(user_config, sort_keys=False)
    p = Path(f'{shared.args.model_dir}/config-user.yaml')
    with open(p, 'w') as f:
        f.write(output)

    if template == 'None':
        yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
    else:
        yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")


@functools.lru_cache(maxsize=1)
def load_gguf_metadata_with_cache(model_file):
    return metadata_gguf.load_metadata(model_file)


def get_model_size_mb(model_file: Path) -> float:
    filename = model_file.name

    # Check for multipart pattern
    match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename)
    if match:
        # It's a multipart file, find all matching parts
        base_pattern = match.group(1)
        part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf'))
        total_size = sum(p.stat().st_size for p in part_files)
    else:
        # Single part
        total_size = model_file.stat().st_size

    return total_size / (1024 ** 2)  # Return size in MB
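# Editor's example (hypothetical filenames): a split GGUF such as
# "model-00001-of-00003.gguf" matches the multipart pattern above, so the sizes of
# all three parts are summed; a single "model.gguf" just uses its own st_size.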
def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
    model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
    metadata = load_gguf_metadata_with_cache(model_file)
    size_in_mb = get_model_size_mb(model_file)

    # Extract values from metadata
    n_layers = None
    n_kv_heads = None
    embedding_dim = None

    for key, value in metadata.items():
        if key.endswith('.block_count'):
            n_layers = value
        elif key.endswith('.attention.head_count_kv'):
            n_kv_heads = max(value) if isinstance(value, list) else value
        elif key.endswith('.embedding_length'):
            embedding_dim = value

    if gpu_layers > n_layers:
        gpu_layers = n_layers

    # Convert cache_type to numeric
    if cache_type == 'q4_0':
        cache_type = 4
    elif cache_type == 'q8_0':
        cache_type = 8
    else:
        cache_type = 16

    # Derived features
    size_per_layer = size_in_mb / max(n_layers, 1e-6)
    kv_cache_factor = n_kv_heads * cache_type * ctx_size
    embedding_per_context = embedding_dim / ctx_size

    # Calculate VRAM using the model
    # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
    vram = (
        (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
        * (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
        + 1516.522943869404
    )

    return vram
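# Editor's rough reading of the fitted formula above (see the linked blog post for
# the derivation): the per-layer size term scales with how many layers are
# offloaded, kv_cache_factor grows linearly with context length, KV-head count,
# and cache precision, and the trailing constant (~1.5 GB) acts as a fixed
# overhead term. For instance, halving ctx_size, or switching the cache from
# f16 to q8_0, halves kv_cache_factor.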
def get_nvidia_vram(return_free=True):
    """
    Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output.

    Args:
        return_free (bool): If True, returns free VRAM. If False, returns total VRAM.

    Returns:
        int: Either the total free VRAM or total VRAM in MiB summed across all detected NVIDIA GPUs.
             Returns -1 if nvidia-smi command fails (not found, error, etc.).
             Returns 0 if nvidia-smi succeeds but no GPU memory info found.
    """
    try:
        # Execute nvidia-smi command
        result = subprocess.run(
            ['nvidia-smi'],
            capture_output=True,
            text=True,
            check=False
        )

        # Check if nvidia-smi returned an error
        if result.returncode != 0:
            return -1

        # Parse the output for memory usage patterns
        output = result.stdout

        # Find memory usage like "XXXXMiB / YYYYMiB"
        # Captures used and total memory for each GPU
        matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output)

        if not matches:
            # No GPUs found in expected format
            return 0

        total_vram_mib = 0
        total_free_vram_mib = 0
        for used_mem_str, total_mem_str in matches:
            try:
                used_mib = int(used_mem_str)
                total_mib = int(total_mem_str)
                total_vram_mib += total_mib
                total_free_vram_mib += (total_mib - used_mib)
            except ValueError:
                # Skip malformed entries
                pass

        # Return either free or total VRAM based on the flag
        return total_free_vram_mib if return_free else total_vram_mib

    except FileNotFoundError:
        # nvidia-smi not found (likely no NVIDIA drivers installed)
        return -1
    except Exception:
        # Handle any other unexpected exceptions
        return -1
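# The regex above matches the per-GPU memory column of the standard nvidia-smi
# table; for example, a line containing "|  1234MiB / 24576MiB  |" would
# contribute 24576 MiB to the total and 23342 MiB to the free count.
# (Editor's example; the exact table layout can vary between driver versions.)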
def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
    """
    Unified function to handle GPU layers and VRAM updates.

    Args:
        for_ui: If True, returns Gradio updates. If False, returns raw values.

    Returns:
        - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
        - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
    """
    if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"):
        vram_info = "<div id=\"vram-info\">Estimated VRAM to load the model:</div>"
        if for_ui:
            return (vram_info, gr.update()) if auto_adjust else vram_info
        else:
            return (0, gpu_layers) if auto_adjust else 0

    current_layers = gpu_layers
    max_layers = gpu_layers

    if auto_adjust:
        # Get model settings including user preferences
        model_settings = get_model_metadata(model)

        # Get the true maximum layers
        max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers))

        # Check if this is a user-saved setting
        user_config = shared.user_config
        model_regex = Path(model).name + '$'
        has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex]

        if has_user_setting:
            # For user settings, just use the current value (which already has the user preference),
            # but ensure the slider maximum is correct
            current_layers = gpu_layers  # Already has user setting
        else:
            # No user setting, auto-adjust from the maximum
            current_layers = max_layers  # Start from max

            # Auto-adjust based on available/total VRAM
            # If a model is loaded and it's for the UI, use the total VRAM to avoid confusion
            return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True
            available_vram = get_nvidia_vram(return_free=return_free)
            if available_vram > 0:
                tolerance = 577
                while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
                    current_layers -= 1

    # Calculate VRAM with current layers
    vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)

    if for_ui:
        vram_info = f"<div id=\"vram-info\">Estimated VRAM to load the model: <span class=\"value\">{vram_usage:.0f} MiB</span></div>"
        if auto_adjust:
            return vram_info, gr.update(value=current_layers, maximum=max_layers)
        else:
            return vram_info
    else:
        if auto_adjust:
            return vram_usage, current_layers
        else:
            return vram_usage
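# Usage sketch (editor's addition, with a hypothetical GGUF filename; presumably
# wired to the model menu's change events in the UI code):
#
#     vram_info, layers_update = update_gpu_layers_and_vram(
#         'llama.cpp', 'llama-3-8b-instruct-Q4_K_M.gguf',
#         gpu_layers=33, ctx_size=8192, cache_type='fp16',
#         auto_adjust=True,
#     )
#
# With for_ui=False, the same call returns the raw (vram_usage, adjusted_layers)
# numbers instead of an HTML string and a Gradio slider update.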