mirror of https://github.com/oobabooga/text-generation-webui.git
synced 2025-06-08 22:56:24 -04:00
commit e8595730b4
1 changed file with 25 additions and 27 deletions
@@ -2,7 +2,7 @@ import functools
 import json
 import re
 import subprocess
-from math import exp
+from math import floor
 from pathlib import Path

 import gradio as gr
@@ -154,10 +154,11 @@ def get_model_metadata(model):
     for pat in settings:
         if re.match(pat.lower(), Path(model).name.lower()):
             for k in settings[pat]:
+                new_k = k
                 if k == 'n_gpu_layers':
-                    k = 'gpu_layers'
+                    new_k = 'gpu_layers'

-                model_settings[k] = settings[pat][k]
+                model_settings[new_k] = settings[pat][k]

     # Load instruction template if defined by name rather than by value
     if model_settings['instruction_template'] != 'Custom (obtained from model metadata)':
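This hunk fixes a lookup bug: the old code rebound the loop variable k to 'gpu_layers' and then used it to index settings[pat], which only contains the original 'n_gpu_layers' key, raising KeyError. Keeping k for the read and introducing new_k for the write resolves it. A minimal standalone sketch of the corrected flow (the dict contents here are made up):

settings_for_pat = {'n_gpu_layers': 33}   # hypothetical matched pattern settings
model_settings = {}

for k in settings_for_pat:
    new_k = 'gpu_layers' if k == 'n_gpu_layers' else k  # rename only on write
    model_settings[new_k] = settings_for_pat[k]         # read with the original key

print(model_settings)  # {'gpu_layers': 33}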
@@ -331,8 +332,6 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
     n_layers = None
     n_kv_heads = None
     embedding_dim = None
-    context_length = None
-    feed_forward_dim = None

     for key, value in metadata.items():
         if key.endswith('.block_count'):
@@ -341,10 +340,6 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type):
             n_kv_heads = value
         elif key.endswith('.embedding_length'):
             embedding_dim = value
-        elif key.endswith('.context_length'):
-            context_length = value
-        elif key.endswith('.feed_forward_length'):
-            feed_forward_dim = value

     if gpu_layers > n_layers:
         gpu_layers = n_layers
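For context on the two hunks above: estimate_vram scans GGUF metadata, whose keys are prefixed with the architecture name (e.g. 'llama.block_count'), hence the endswith() matching; context_length and feed_forward_length are dropped because the new formula no longer uses them. A self-contained sketch of the scan, with illustrative key names and values:

metadata = {
    'llama.block_count': 32,             # transformer layers
    'llama.attention.head_count_kv': 8,  # KV heads (GQA)
    'llama.embedding_length': 4096,      # hidden size
}

n_layers = n_kv_heads = embedding_dim = None
for key, value in metadata.items():
    if key.endswith('.block_count'):
        n_layers = value
    elif key.endswith('.head_count_kv'):
        n_kv_heads = value
    elif key.endswith('.embedding_length'):
        embedding_dim = value

print(n_layers, n_kv_heads, embedding_dim)  # 32 8 4096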
@@ -359,32 +354,29 @@

     # Derived features
     size_per_layer = size_in_mb / max(n_layers, 1e-6)
-    context_per_layer = context_length / max(n_layers, 1e-6)
-    ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6)
     kv_cache_factor = n_kv_heads * cache_type * ctx_size
+    embedding_per_context = embedding_dim / ctx_size

-    # Helper function for smaller
-    def smaller(x, y):
-        return 1 if x < y else 0
-
     # Calculate VRAM using the model
     # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
     vram = (
-        (size_per_layer - 21.19195204848197)
-        * exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845))
-        + 0.0006621544775632052 * context_per_layer
-        + 3.34664386576376e-05 * kv_cache_factor
-    ) * (1.363306170123392 + gpu_layers) + 1255.163594536052
+        (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
+        * (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
+        + 1516.522943869404
+    )

     return vram


-def get_nvidia_free_vram():
+def get_nvidia_vram(return_free=True):
     """
-    Calculates the total free VRAM across all NVIDIA GPUs by parsing nvidia-smi output.
+    Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output.
+
+    Args:
+        return_free (bool): If True, returns free VRAM. If False, returns total VRAM.

     Returns:
-        int: The total free VRAM in MiB summed across all detected NVIDIA GPUs.
+        int: Either the total free VRAM or total VRAM in MiB summed across all detected NVIDIA GPUs.
              Returns -1 if nvidia-smi command fails (not found, error, etc.).
              Returns 0 if nvidia-smi succeeds but no GPU memory info found.
     """
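To see the new fitted formula end to end, here is a hedged standalone restatement with made-up inputs. It assumes, as the surrounding code does, that cache_type has already been converted to a numeric bit width (e.g. 16 for FP16) and that size_in_mb is the GGUF file size in MiB:

from math import floor

def estimate_vram_sketch(size_in_mb, n_layers, n_kv_heads, embedding_dim,
                         ctx_size, cache_type, gpu_layers):
    # Derived features, as in the new code
    size_per_layer = size_in_mb / max(n_layers, 1e-6)
    kv_cache_factor = n_kv_heads * cache_type * ctx_size
    embedding_per_context = embedding_dim / ctx_size

    # The fitted model from the hunk above
    return (
        (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
        * (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
        + 1516.522943869404
    )

# Roughly 7B-class numbers, for illustration only; prints about 6339 MiB
print(round(estimate_vram_sketch(size_in_mb=4200, n_layers=32, n_kv_heads=8,
                                 embedding_dim=4096, ctx_size=8192,
                                 cache_type=16, gpu_layers=32)), 'MiB')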
@@ -412,17 +404,21 @@ def get_nvidia_free_vram():
             # No GPUs found in expected format
             return 0

+        total_vram_mib = 0
         total_free_vram_mib = 0

         for used_mem_str, total_mem_str in matches:
             try:
                 used_mib = int(used_mem_str)
                 total_mib = int(total_mem_str)
+                total_vram_mib += total_mib
                 total_free_vram_mib += (total_mib - used_mib)
             except ValueError:
                 # Skip malformed entries
                 pass

-        return total_free_vram_mib
+        # Return either free or total VRAM based on the flag
+        return total_free_vram_mib if return_free else total_vram_mib

     except FileNotFoundError:
         # nvidia-smi not found (likely no NVIDIA drivers installed)
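A hedged sketch of the parsing context this hunk sits in: the real function shells out to nvidia-smi and collects (used, total) pairs per GPU; the exact regex below is an assumption for illustration, not copied from the repo.

import re
import subprocess

def nvidia_vram_sketch(return_free=True):
    try:
        output = subprocess.check_output(['nvidia-smi'], text=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        return -1  # nvidia-smi missing or failed

    # Lines like "|  1234MiB / 24576MiB |" -> [('1234', '24576'), ...]
    matches = re.findall(r'(\d+)MiB\s*/\s*(\d+)MiB', output)
    if not matches:
        return 0  # nvidia-smi ran but no GPU memory info found

    total_vram_mib = 0
    total_free_vram_mib = 0
    for used_mem_str, total_mem_str in matches:
        used_mib, total_mib = int(used_mem_str), int(total_mem_str)
        total_vram_mib += total_mib
        total_free_vram_mib += total_mib - used_mib

    return total_free_vram_mib if return_free else total_vram_mib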
@@ -473,10 +469,12 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type,
         # No user setting, auto-adjust from the maximum
         current_layers = max_layers  # Start from max

-        # Auto-adjust based on available VRAM
-        available_vram = get_nvidia_free_vram()
+        # Auto-adjust based on available/total VRAM
+        # If a model is loaded and it's for the UI, use the total VRAM to avoid confusion
+        return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True
+        available_vram = get_nvidia_vram(return_free=return_free)
         if available_vram > 0:
-            tolerance = 906
+            tolerance = 577
             while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance:
                 current_layers -= 1
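The adjustment itself is a simple downward walk: start with every layer on the GPU and decrement until the VRAM estimate fits inside the measured budget minus the tolerance (retuned here from 906 to 577 MiB alongside the new formula). A toy version with a linear stand-in for estimate_vram (all numbers made up):

def fit_layers(estimate, max_layers, budget_mib, tolerance=577):
    layers = max_layers
    while layers > 0 and estimate(layers) > budget_mib - tolerance:
        layers -= 1
    return layers

# e.g. ~150 MiB per layer plus 1500 MiB overhead, 6000 MiB budget -> 26
print(fit_layers(lambda n: 150 * n + 1500, max_layers=32, budget_mib=6000))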