Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2025-06-07 14:17:09 -04:00)
Fix exllamav3_hf models failing to unload (closes #7031)
commit 219f0a7731 (parent 298d4719c6)
2 changed files with 20 additions and 0 deletions
@@ -245,3 +245,20 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin):
         pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path)
 
         return Exllamav3HF(pretrained_model_name_or_path)
+
+    def unload(self):
+        """Properly unload the ExllamaV3 model and free GPU memory."""
+        if hasattr(self, 'ex_model') and self.ex_model is not None:
+            self.ex_model.unload()
+            self.ex_model = None
+
+        if hasattr(self, 'ex_cache') and self.ex_cache is not None:
+            self.ex_cache = None
+
+        # Clean up any additional ExllamaV3 resources
+        if hasattr(self, 'past_seq'):
+            self.past_seq = None
+        if hasattr(self, 'past_seq_negative'):
+            self.past_seq_negative = None
+        if hasattr(self, 'ex_cache_negative'):
+            self.ex_cache_negative = None
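For context, a minimal usage sketch of the new unload() method (not part of the commit): explicitly unloading an Exllamav3HF instance and returning its VRAM to the allocator. The import path, the model folder name, and the surrounding cleanup calls are assumptions for illustration only.

# Hypothetical usage sketch; import path and model folder name are assumed.
import gc

import torch

from modules.exllamav3_hf import Exllamav3HF  # assumed location of the class

model = Exllamav3HF.from_pretrained('MyModel-exl3')  # folder under shared.args.model_dir
# ... run generation ...
model.unload()            # frees ex_model, ex_cache and the past-sequence buffers
del model
gc.collect()
torch.cuda.empty_cache()  # hand the freed blocks back to the CUDA allocator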
@@ -116,10 +116,13 @@ def unload_model(keep_model_name=False):
         return
 
     is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
+    if shared.args.loader == 'ExLlamav3_HF':
+        shared.model.unload()
+
     shared.model = shared.tokenizer = None
     shared.lora_names = []
     shared.model_dirty_from_training = False
 
     if not is_llamacpp:
         from modules.torch_utils import clear_torch_cache
         clear_torch_cache()
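A rough way to verify the fix (a sketch, not from the repository): allocated CUDA memory should fall back toward its baseline once unload_model() runs with the ExLlamav3_HF loader. The load_model/unload_model import path and the use of shared.model_name are assumptions.

# Hypothetical check that VRAM is actually released after unload_model().
import torch

from modules import shared                            # assumed import paths
from modules.models import load_model, unload_model

baseline = torch.cuda.memory_allocated()
shared.model, shared.tokenizer = load_model(shared.model_name)
print(f"after load:   {torch.cuda.memory_allocated() / 1e9:.2f} GB")

unload_model()  # with loader == 'ExLlamav3_HF' this now calls shared.model.unload()
print(f"after unload: {torch.cuda.memory_allocated() / 1e9:.2f} GB")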