import functools
import json
import re
import subprocess
from math import floor
from pathlib import Path

import gradio as gr
import yaml

from modules import chat, loaders, metadata_gguf, shared, ui


def get_fallback_settings():
    return {
        'bf16': False,
        'use_eager_attention': False,
        'ctx_size': 2048,
        'rope_freq_base': 0,
        'compress_pos_emb': 1,
        'alpha_value': 1,
        'truncation_length': shared.settings['truncation_length'],
        'truncation_length_info': shared.settings['truncation_length'],
        'skip_special_tokens': shared.settings['skip_special_tokens'],
        'custom_stopping_strings': shared.settings['custom_stopping_strings'],
    }


def get_model_metadata(model):
    model_settings = {}

    # Get settings from user_data/models/config.yaml and user_data/models/config-user.yaml
    settings = shared.model_config
    for pat in settings:
        if re.match(pat.lower(), Path(model).name.lower()):
            for k in settings[pat]:
                model_settings[k] = settings[pat][k]

    path = Path(f'{shared.args.model_dir}/{model}/config.json')
    if path.exists():
        hf_metadata = json.loads(open(path, 'r', encoding='utf-8').read())
    else:
        hf_metadata = None

    if 'loader' not in model_settings:
        quant_method = None if hf_metadata is None else hf_metadata.get("quantization_config", {}).get("quant_method", None)
        model_settings['loader'] = infer_loader(
            model,
            model_settings,
            hf_quant_method=quant_method
        )

    # GGUF metadata
    if model_settings['loader'] == 'llama.cpp':
        path = Path(f'{shared.args.model_dir}/{model}')
        if path.is_file():
            model_file = path
        else:
            model_file = list(path.glob('*.gguf'))[0]

        metadata = load_gguf_metadata_with_cache(model_file)

        for k in metadata:
            if k.endswith('context_length'):
                model_settings['ctx_size'] = min(metadata[k], 8192)
                model_settings['truncation_length_info'] = metadata[k]
            elif k.endswith('rope.freq_base'):
                model_settings['rope_freq_base'] = metadata[k]
            elif k.endswith('rope.scale_linear'):
                model_settings['compress_pos_emb'] = metadata[k]
            elif k.endswith('rope.scaling.factor'):
                model_settings['compress_pos_emb'] = metadata[k]
            elif k.endswith('block_count'):
                model_settings['gpu_layers'] = metadata[k] + 1
                model_settings['max_gpu_layers'] = metadata[k] + 1

        if 'tokenizer.chat_template' in metadata:
            template = metadata['tokenizer.chat_template']
            eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']]
            if 'tokenizer.ggml.bos_token_id' in metadata:
                bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']]
            else:
                bos_token = ""

            template = template.replace('eos_token', "'{}'".format(eos_token))
            template = template.replace('bos_token', "'{}'".format(bos_token))

            template = re.sub(r'raise_exception\([^)]*\)', "''", template)
            template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
            model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
            model_settings['instruction_template_str'] = template

    else:
        # Transformers metadata
        if hf_metadata is not None:
            metadata = json.loads(open(path, 'r', encoding='utf-8').read())
            if 'pretrained_config' in metadata:
                metadata = metadata['pretrained_config']

            for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']:
                if k in metadata:
                    model_settings['truncation_length'] = metadata[k]
                    model_settings['truncation_length_info'] = metadata[k]
                    model_settings['ctx_size'] = min(metadata[k], 8192)

            if 'rope_theta' in metadata:
                model_settings['rope_freq_base'] = metadata['rope_theta']
            elif 'attn_config' in metadata and 'rope_theta' in metadata['attn_config']:
                model_settings['rope_freq_base'] = metadata['attn_config']['rope_theta']

            if 'rope_scaling' in metadata and isinstance(metadata['rope_scaling'], dict) and all(key in metadata['rope_scaling'] for key in ('type', 'factor')):
                if metadata['rope_scaling']['type'] == 'linear':
                    model_settings['compress_pos_emb'] = metadata['rope_scaling']['factor']

            # For Gemma-2
            if 'torch_dtype' in metadata and metadata['torch_dtype'] == 'bfloat16':
                model_settings['bf16'] = True

            # For Gemma-2
            if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']:
                model_settings['use_eager_attention'] = True

        # Try to find the Jinja instruct template
        path = Path(f'{shared.args.model_dir}/{model}') / 'tokenizer_config.json'
        if path.exists():
            metadata = json.loads(open(path, 'r', encoding='utf-8').read())
            if 'chat_template' in metadata:
                template = metadata['chat_template']
                if isinstance(template, list):
                    template = template[0]['template']

                for k in ['eos_token', 'bos_token']:
                    if k in metadata:
                        value = metadata[k]
                        if isinstance(value, dict):
                            value = value['content']

                        template = template.replace(k, "'{}'".format(value))

                template = re.sub(r'raise_exception\([^)]*\)', "''", template)
                template = re.sub(r'{% if add_generation_prompt %}.*', '', template, flags=re.DOTALL)
                model_settings['instruction_template'] = 'Custom (obtained from model metadata)'
                model_settings['instruction_template_str'] = template

    if 'instruction_template' not in model_settings:
        model_settings['instruction_template'] = 'Alpaca'

    # Ignore rope_freq_base if set to the default value
    if 'rope_freq_base' in model_settings and model_settings['rope_freq_base'] == 10000:
        model_settings.pop('rope_freq_base')

    # Apply user settings from user_data/models/config-user.yaml
    settings = shared.user_config
    for pat in settings:
        if re.match(pat.lower(), Path(model).name.lower()):
            for k in settings[pat]:
                new_k = k
                if k == 'n_gpu_layers':
                    new_k = 'gpu_layers'

                model_settings[new_k] = settings[pat][k]

    # Load instruction template if defined by name rather than by value
    if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
        model_settings['instruction_template_str'] = chat.load_instruction_template(model_settings['instruction_template'])

    return model_settings

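# Illustration (hypothetical config-user.yaml entry, not shipped with the project):
# the keys in shared.model_config and shared.user_config are regular expressions
# matched against the lowercased model name, so an entry such as
#
#   mymodel.*\.gguf$:
#     ctx_size: 8192
#     gpu_layers: 33
#
# is applied last in get_model_metadata(), overriding the values derived from the
# model's own metadata above.
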
def infer_loader(model_name, model_settings, hf_quant_method=None):
    path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
    if not path_to_model.exists():
        loader = None
    elif len(list(path_to_model.glob('*.gguf'))) > 0:
        loader = 'llama.cpp'
    elif re.match(r'.*\.gguf', model_name.lower()):
        loader = 'llama.cpp'
    elif hf_quant_method == 'exl3':
        loader = 'ExLlamav3_HF'
    elif hf_quant_method in ['exl2', 'gptq']:
        loader = 'ExLlamav2_HF'
    elif re.match(r'.*exl3', model_name.lower()):
        loader = 'ExLlamav3_HF'
    elif re.match(r'.*exl2', model_name.lower()):
        loader = 'ExLlamav2_HF'
    else:
        loader = 'Transformers'

    return loader


def update_model_parameters(state, initial=False):
    '''
    UI: update the command-line arguments based on the interface values
    '''
    elements = ui.list_model_elements()  # the names of the parameters

    for i, element in enumerate(elements):
        if element not in state:
            continue

        value = state[element]
        if initial and element in shared.provided_arguments:
            continue

        if element == 'cpu_memory' and value == 0:
            value = vars(shared.args_defaults)[element]

        setattr(shared.args, element, value)


def apply_model_settings_to_state(model, state):
    '''
    UI: update the state variable with the model settings
    '''
    model_settings = get_model_metadata(model)
    if 'loader' in model_settings:
        loader = model_settings.pop('loader')
        if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']):
            state['loader'] = loader

    for k in model_settings:
        if k in state and k != 'gpu_layers':  # Skip gpu_layers, handle separately
            state[k] = model_settings[k]

    # Handle GPU layers and VRAM update for llama.cpp
    if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings:
        vram_info, gpu_layers_update = update_gpu_layers_and_vram(
            state['loader'],
            model,
            model_settings['gpu_layers'],
            state['ctx_size'],
            state['cache_type'],
            auto_adjust=True
        )

        state['gpu_layers'] = gpu_layers_update
        state['vram_info'] = vram_info

    return state

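# Minimal sketch of how the two UI helpers above compose (hypothetical model name
# and a simplified state dict; the real one is built from the Gradio elements
# returned by ui.list_model_elements()):
#
#   state = {'loader': 'llama.cpp', 'gpu_layers': 0, 'ctx_size': 8192,
#            'cache_type': 'fp16', 'vram_info': ''}
#   state = apply_model_settings_to_state('MyModel.Q4_K_M.gguf', state)
#   update_model_parameters(state)  # copy the resulting values onto shared.args
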
def save_model_settings(model, state):
    '''
    Save the settings for this model to user_data/models/config-user.yaml
    '''
    if model == 'None':
        yield ("Not saving the settings because no model is selected in the menu.")
        return

    user_config = shared.load_user_config()
    model_regex = Path(model).name + '$'  # For exact matches
    if model_regex not in user_config:
        user_config[model_regex] = {}

    for k in ui.list_model_elements():
        if k == 'loader' or k in loaders.loaders_and_params[state['loader']]:
            user_config[model_regex][k] = state[k]

    shared.user_config = user_config

    output = yaml.dump(user_config, sort_keys=False)
    p = Path(f'{shared.args.model_dir}/config-user.yaml')
    with open(p, 'w') as f:
        f.write(output)

    yield (f"Settings for `{model}` saved to `{p}`.")


def save_instruction_template(model, template):
    '''
    Similar to the function above, but it saves only the instruction template.
    '''
    if model == 'None':
        yield ("Not saving the template because no model is selected in the menu.")
        return

    user_config = shared.load_user_config()
    model_regex = Path(model).name + '$'  # For exact matches
    if model_regex not in user_config:
        user_config[model_regex] = {}

    if template == 'None':
        user_config[model_regex].pop('instruction_template', None)
    else:
        user_config[model_regex]['instruction_template'] = template

    shared.user_config = user_config

    output = yaml.dump(user_config, sort_keys=False)
    p = Path(f'{shared.args.model_dir}/config-user.yaml')
    with open(p, 'w') as f:
        f.write(output)

    if template == 'None':
        yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.")
    else:
        yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.")


@functools.lru_cache(maxsize=1)
def load_gguf_metadata_with_cache(model_file):
    return metadata_gguf.load_metadata(model_file)


def get_model_size_mb(model_file: Path) -> float:
    filename = model_file.name

    # Check for multipart pattern
    match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename)
    if match:
        # It's a multipart file, find all matching parts
        base_pattern = match.group(1)
        part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf'))
        total_size = sum(p.stat().st_size for p in part_files)
    else:
        # Single part
        total_size = model_file.stat().st_size

    return total_size / (1024 ** 2)  # Return size in MB

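# Example of the multipart handling above (hypothetical file names): for a shard
# named 'Large-Model-00001-of-00003.gguf', the regex captures 'Large-Model', the
# glob 'Large-Model-*-of-*.gguf' collects all three shards, and the returned
# value is their combined size in MB.
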
def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
    model_file = Path(f'{shared.args.model_dir}/{gguf_file}')
    metadata = load_gguf_metadata_with_cache(model_file)
    size_in_mb = get_model_size_mb(model_file)

    # Extract values from metadata
    n_layers = None
    n_kv_heads = None
    embedding_dim = None

    for key, value in metadata.items():
        if key.endswith('.block_count'):
            n_layers = value
        elif key.endswith('.attention.head_count_kv'):
            n_kv_heads = max(value) if isinstance(value, list) else value
        elif key.endswith('.embedding_length'):
            embedding_dim = value

    if gpu_layers > n_layers:
        gpu_layers = n_layers

    # Convert cache_type to numeric
    if cache_type == 'q4_0':
        cache_type = 4
    elif cache_type == 'q8_0':
        cache_type = 8
    else:
        cache_type = 16

    # Derived features
    size_per_layer = size_in_mb / max(n_layers, 1e-6)
    kv_cache_factor = n_kv_heads * cache_type * ctx_size
    embedding_per_context = embedding_dim / ctx_size

    # Calculate VRAM using the model
    # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
    vram = (
        (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
        * (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
        + 1516.522943869404
    )

    return vram

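# The constants above come from an empirical fit rather than a closed-form
# derivation; see https://oobabooga.github.io/blog/posts/gguf-vram-formula/.
# Usage sketch (hypothetical file name, resolved relative to shared.args.model_dir):
#
#   estimate_vram('MyModel.Q4_K_M.gguf', gpu_layers=33, ctx_size=8192,
#                 cache_type='q8_0')  # -> estimated VRAM in MiB
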
def get_nvidia_vram(return_free=True):
    """
    Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output.

    Args:
        return_free (bool): If True, returns free VRAM. If False, returns total VRAM.

    Returns:
        int: Either the total free VRAM or total VRAM in MiB summed across all
             detected NVIDIA GPUs.
             Returns -1 if nvidia-smi command fails (not found, error, etc.).
             Returns 0 if nvidia-smi succeeds but no GPU memory info found.
    """
    try:
        # Execute nvidia-smi command
        result = subprocess.run(
            ['nvidia-smi'],
            capture_output=True,
            text=True,
            check=False
        )

        # Check if nvidia-smi returned an error
        if result.returncode != 0:
            return -1

        # Parse the output for memory usage patterns
        output = result.stdout

        # Find memory usage like "XXXXMiB / YYYYMiB"
        # Captures used and total memory for each GPU
        matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output)

        if not matches:
            # No GPUs found in expected format
            return 0

        total_vram_mib = 0
        total_free_vram_mib = 0
        for used_mem_str, total_mem_str in matches:
            try:
                used_mib = int(used_mem_str)
                total_mib = int(total_mem_str)
                total_vram_mib += total_mib
                total_free_vram_mib += (total_mib - used_mib)
            except ValueError:
                # Skip malformed entries
                pass

        # Return either free or total VRAM based on the flag
        return total_free_vram_mib if return_free else total_vram_mib

    except FileNotFoundError:
        # nvidia-smi not found (likely no NVIDIA drivers installed)
        return -1
    except Exception:
        # Handle any other unexpected exceptions
        return -1


def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True):
    """
    Unified function to handle GPU layers and VRAM updates.

    Args:
        for_ui: If True, returns Gradio updates. If False, returns raw values.

    Returns:
        - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update
        - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage
    """
    if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"):
        vram_info = "Estimated VRAM to load the model:"
        if for_ui:
            return (vram_info, gr.update()) if auto_adjust else vram_info
        else:
            return (0, gpu_layers) if auto_adjust else 0

    current_layers = gpu_layers
    max_layers = gpu_layers

    if auto_adjust:
        # Get model settings including user preferences
        model_settings = get_model_metadata(model)

        # Get the true maximum layers
        max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers))

        # Check if this is a user-saved setting
        user_config = shared.user_config
        model_regex = Path(model).name + '$'
        has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex]

        if has_user_setting:
            # For user settings, just use the current value (which already has user pref)
            # but ensure the slider maximum is correct
            current_layers = gpu_layers  # Already has user setting
        else:
            # No user setting, auto-adjust from the maximum
            current_layers = max_layers  # Start from max

            # Auto-adjust based on available/total VRAM
            # If a model is loaded and it's for the UI, use the total VRAM to avoid confusion
            return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True
            available_vram = get_nvidia_vram(return_free=return_free)
            if available_vram > 0:
                tolerance = 577
                while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
                    current_layers -= 1

    # Calculate VRAM with current layers
    vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type)

    if for_ui:
        vram_info = f"Estimated VRAM to load the model: {vram_usage:.0f} MiB"
        if auto_adjust:
            return vram_info, gr.update(value=current_layers, maximum=max_layers)
        else:
            return vram_info
    else:
        if auto_adjust:
            return vram_usage, current_layers
        else:
            return vram_usage
