Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2025-06-07 14:17:09 -04:00)
Remove the HQQ loader (HQQ models can be loaded through Transformers)
parent 0c7237e4b7
commit 9ec46b8c44
7 changed files with 2 additions and 37 deletions
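The diff below removes the dedicated HQQ loader, its `--hqq-backend` flag, and the matching UI elements. For context only (not part of this commit's code), a minimal sketch of the path the commit message points to: HQQ quantization applied through Transformers' own HQQ integration, which requires installing `hqq` manually. The model id is a placeholder and `accelerate` is assumed for `device_map="auto"`.

```python
# Minimal sketch, assuming `pip install hqq` and a transformers release that
# ships the HQQ integration (HqqConfig). The model id is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

model_id = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder

# Quantize to 4-bit HQQ on the fly while loading through Transformers,
# instead of going through a separate HQQ-specific loader.
quant_config = HqqConfig(nbits=4, group_size=64)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
```

Within the webui, such models go through the regular Transformers loader; no separate HQQ entry remains after this commit.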
README.md

@@ -12,9 +12,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 
 ## Features
 
-- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2).
-- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is also supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile).
-- Additional quantization libraries like [HQQ](https://github.com/mobiusml/hqq) and [AQLM](https://github.com/Vahe1994/AQLM) can be used with the Transformers loader if you install them manually.
+- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)).
 - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment.
 - UI that resembles the original ChatGPT style.
 - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats.
modules/loaders.py

@@ -90,11 +90,6 @@ loaders_and_params = OrderedDict({
         'ctx_size_draft',
         'speculative_decoding_accordion',
     ],
-    'HQQ': [
-        'hqq_backend',
-        'trust_remote_code',
-        'no_use_fast',
-    ],
     'TensorRT-LLM': [
         'ctx_size',
         'cpp_runner',
@@ -158,7 +153,6 @@ def transformers_samplers():
 
 loaders_samplers = {
     'Transformers': transformers_samplers(),
-    'HQQ': transformers_samplers(),
     'ExLlamav3_HF': {
         'temperature',
         'dynatemp_low',
modules/models.py

@@ -21,7 +21,6 @@ def load_model(model_name, loader=None):
         'ExLlamav3_HF': ExLlamav3_HF_loader,
         'ExLlamav2_HF': ExLlamav2_HF_loader,
         'ExLlamav2': ExLlamav2_loader,
-        'HQQ': HQQ_loader,
         'TensorRT-LLM': TensorRT_LLM_loader,
     }
 
@@ -102,21 +101,6 @@ def ExLlamav2_loader(model_name):
     return model, tokenizer
 
 
-def HQQ_loader(model_name):
-    try:
-        from hqq.core.quantize import HQQBackend, HQQLinear
-        from hqq.models.hf.base import AutoHQQHFModel
-    except ModuleNotFoundError:
-        raise ModuleNotFoundError("Failed to import 'hqq'. Please install it manually following the instructions in the HQQ GitHub repository.")
-
-    logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"")
-
-    model_dir = Path(f'{shared.args.model_dir}/{model_name}')
-    model = AutoHQQHFModel.from_quantized(str(model_dir))
-    HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend))
-    return model
-
-
 def TensorRT_LLM_loader(model_name):
     try:
         from modules.tensorrt_llm import TensorRTLLMModel
modules/models_settings.py

@@ -183,8 +183,6 @@ def infer_loader(model_name, model_settings, hf_quant_method=None):
         loader = 'ExLlamav3_HF'
     elif re.match(r'.*exl2', model_name.lower()):
         loader = 'ExLlamav2_HF'
-    elif re.match(r'.*-hqq', model_name.lower()):
-        return 'HQQ'
     else:
         loader = 'Transformers'
 
modules/shared.py

@@ -87,7 +87,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft
 
 # Model loader
 group = parser.add_argument_group('Model loader')
-group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, TensorRT-LLM.')
+group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
@@ -152,10 +152,6 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n
 group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.')
 group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.')
 
-# HQQ
-group = parser.add_argument_group('HQQ')
-group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
-
 # TensorRT-LLM
 group = parser.add_argument_group('TensorRT-LLM')
 group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.')
@@ -263,8 +259,6 @@ def fix_loader_name(name):
         return 'ExLlamav2_HF'
     elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']:
         return 'ExLlamav3_HF'
-    elif name in ['hqq']:
-        return 'HQQ'
     elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
         return 'TensorRT-LLM'
 
modules/ui.py

@@ -109,7 +109,6 @@ def list_model_elements():
         'threads',
         'threads_batch',
         'batch_size',
-        'hqq_backend',
         'ctx_size',
         'cache_type',
         'tensor_split',
modules/ui_model_menu.py

@@ -42,8 +42,6 @@ def create_ui():
             shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.')
             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
             shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).')
-            shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend)
-
         with gr.Column():
             shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info())
             shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.')