Merge pull request #6992 from oobabooga/dev

Merge dev branch
commit e8595730b4
Author: oobabooga (committed by GitHub)
Date: 2025-05-17 11:58:46 -03:00
Signature: GPG key ID B5690EEEBB952194 (no known key found for this signature in database)

@@ -2,7 +2,7 @@ import functools
 import json
 import re
 import subprocess
-from math import exp
+from math import floor
 from pathlib import Path
 
 import gradio as gr
@@ -154,10 +154,11 @@ def get_model_metadata(model):
     for pat in settings:
         if re.match(pat.lower(), Path(model).name.lower()):
             for k in settings[pat]:
+                new_k = k
                 if k == 'n_gpu_layers':
-                    k = 'gpu_layers'
+                    new_k = 'gpu_layers'
 
-                model_settings[k] = settings[pat][k]
+                model_settings[new_k] = settings[pat][k]
 
     # Load instruction template if defined by name rather than by value
     if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
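Aside (not part of the diff): the new_k indirection keeps the loop key intact and only renames n_gpu_layers on write. A minimal standalone sketch with a made-up settings dict and filename, runnable on its own:

import re
from pathlib import Path

# Hypothetical user settings keyed by filename regex (illustration only)
settings = {
    '.*my-model.*': {'n_gpu_layers': 33, 'ctx_size': 8192},
}

model = 'My-Model-7B-Q4_K_M.gguf'
model_settings = {}

for pat in settings:
    if re.match(pat.lower(), Path(model).name.lower()):
        for k in settings[pat]:
            # Store under 'gpu_layers' without mutating the loop variable
            new_k = 'gpu_layers' if k == 'n_gpu_layers' else k
            model_settings[new_k] = settings[pat][k]

print(model_settings)  # {'gpu_layers': 33, 'ctx_size': 8192}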
@@ -331,8 +332,6 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
     n_layers = None
     n_kv_heads = None
     embedding_dim = None
-    context_length = None
-    feed_forward_dim = None
 
     for key, value in metadata.items():
         if key.endswith('.block_count'):
@@ -341,10 +340,6 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
             n_kv_heads = value
         elif key.endswith('.embedding_length'):
             embedding_dim = value
-        elif key.endswith('.context_length'):
-            context_length = value
-        elif key.endswith('.feed_forward_length'):
-            feed_forward_dim = value
 
     if gpu_layers > n_layers:
         gpu_layers = n_layers
@@ -359,32 +354,29 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
 
     # Derived features
     size_per_layer = size_in_mb / max(n_layers, 1e-6)
-    context_per_layer = context_length / max(n_layers, 1e-6)
-    ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6)
     kv_cache_factor = n_kv_heads * cache_type * ctx_size
+    embedding_per_context = embedding_dim / ctx_size
 
-    # Helper function for smaller
-    def smaller(x, y):
-        return 1 if x < y else 0
-
     # Calculate VRAM using the model
     # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
     vram = (
-        (size_per_layer - 21.19195204848197)
-        * exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845))
-        + 0.0006621544775632052 * context_per_layer
-        + 3.34664386576376e-05 * kv_cache_factor
-    ) * (1.363306170123392 + gpu_layers) + 1255.163594536052
+        (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
+        * (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
+        + 1516.522943869404
+    )
 
     return vram
 
 
-def get_nvidia_free_vram():
+def get_nvidia_vram(return_free=True):
     """
-    Calculates the total free VRAM across all NVIDIA GPUs by parsing nvidia-smi output.
+    Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output.
+
+    Args:
+        return_free (bool): If True, returns free VRAM. If False, returns total VRAM.
 
     Returns:
-        int: The total free VRAM in MiB summed across all detected NVIDIA GPUs.
+        int: Either the total free VRAM or total VRAM in MiB summed across all detected NVIDIA GPUs.
             Returns -1 if nvidia-smi command fails (not found, error, etc.).
             Returns 0 if nvidia-smi succeeds but no GPU memory info found.
     """
@@ -412,17 +404,21 @@ def get_nvidia_free_vram():
             # No GPUs found in expected format
             return 0
 
+        total_vram_mib = 0
         total_free_vram_mib = 0
         for used_mem_str, total_mem_str in matches:
             try:
                 used_mib = int(used_mem_str)
                 total_mib = int(total_mem_str)
+
+                total_vram_mib += total_mib
                 total_free_vram_mib += (total_mib - used_mib)
             except ValueError:
                 # Skip malformed entries
                 pass
 
-        return total_free_vram_mib
+        # Return either free or total VRAM based on the flag
+        return total_free_vram_mib if return_free else total_vram_mib
 
     except FileNotFoundError:
         # nvidia-smi not found (likely no NVIDIA drivers installed)
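Aside (not part of the diff): the same free-versus-total accounting can be sketched with nvidia-smi's query interface instead of regex-parsing the default table; this is an alternative invocation for illustration, not the project's implementation:

import subprocess

def nvidia_vram_mib(return_free=True):
    """Sum free (or total) VRAM in MiB across NVIDIA GPUs; -1 if nvidia-smi is unavailable."""
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=memory.used,memory.total',
             '--format=csv,noheader,nounits'],
            capture_output=True, text=True, check=True,
        )
    except (FileNotFoundError, subprocess.CalledProcessError):
        return -1

    total_vram_mib = 0
    total_free_vram_mib = 0
    for line in result.stdout.strip().splitlines():
        try:
            used_mib, total_mib = (int(x) for x in line.split(','))
        except ValueError:
            continue  # skip malformed entries
        total_vram_mib += total_mib
        total_free_vram_mib += total_mib - used_mib

    return total_free_vram_mib if return_free else total_vram_mib

print(nvidia_vram_mib(return_free=True))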
@@ -473,10 +469,12 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type,
         # No user setting, auto-adjust from the maximum
         current_layers = max_layers  # Start from max
 
-        # Auto-adjust based on available VRAM
-        available_vram = get_nvidia_free_vram()
+        # Auto-adjust based on available/total VRAM
+        # If a model is loaded and it's for the UI, use the total VRAM to avoid confusion
+        return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True
+        available_vram = get_nvidia_vram(return_free=return_free)
         if available_vram > 0:
-            tolerance = 906
+            tolerance = 577
             while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
                 current_layers -= 1
 
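Aside (not part of the diff): the while loop above walks the layer count down until the estimate fits inside the VRAM budget minus the tolerance. A minimal sketch of that search with a toy linear cost function standing in for estimate_vram (the numbers here are invented, not the project's):

def toy_estimate_vram(gpu_layers):
    """Toy stand-in for estimate_vram(): fixed base cost plus a per-layer cost, in MiB."""
    return 1500 + 140 * gpu_layers

def auto_adjust_layers(max_layers, available_vram, tolerance=577):
    """Reduce the offloaded layer count until the estimated usage fits the budget."""
    current_layers = max_layers  # Start from max
    while current_layers > 0 and toy_estimate_vram(current_layers) > available_vram - tolerance:
        current_layers -= 1
    return current_layers

# With ~8 GiB of VRAM and the toy cost model, 43 of 48 layers fit
print(auto_adjust_layers(max_layers=48, available_vram=8192))  # 43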