Merge branch 'main' into MPT

# Conflicts:
#   auto_gptq/modeling/__init__.py
#   auto_gptq/modeling/_const.py
#   auto_gptq/modeling/auto.py

Commit 6ff6bc8dfc: 37 changed files with 3503 additions and 258905 deletions
@@ -1,4 +1,4 @@
-name: Build AutoGPTQ Wheels
+name: Build AutoGPTQ Wheels with CUDA
 
 on: workflow_dispatch
 
@@ -51,7 +51,7 @@ jobs:
         if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH}
         $env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6+PTX'
         if ([decimal]$env:CUDA_VERSION -ge 11.8) { $env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-        python -m build -n
+        python setup.py sdist bdist_wheel
 
     - uses: actions/upload-artifact@v3
       if: runner.os == 'Linux'
@@ -64,37 +64,3 @@ jobs:
       with:
         name: 'windows-wheels'
         path: ./dist/*.whl
-
-  build_sdist:
-    name: Build source distribution
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        shell: pwsh
-
-    steps:
-    - uses: actions/checkout@v3
-      with:
-        ref: 'main'
-
-    - uses: actions/setup-python@v3
-      with:
-        python-version: "3.10"
-
-    - name: Install Dependencies
-      run: |
-        python -m pip install --upgrade build setuptools wheel
-
-    - name: Build Wheel
-      run: |
-        python -m build -n
-
-    - uses: actions/upload-artifact@v3
-      with:
-        name: 'sdist'
-        path: ./dist/*.tar.gz
-
-    - uses: actions/upload-artifact@v3
-      with:
-        name: 'no-cuda-wheel'
-        path: ./dist/*.whl
README.md: 41 changes

@@ -12,14 +12,15 @@
     <p>
         <b>English</b> |
         <a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README_zh.md">中文</a>
-    <p>
+    </p>
 </h4>
 
+*<center>📣 Long time no see! 👋 Architecture upgrade, performance optimization and more new features will come in July and August, stay tune! 🥂</center>*
 
 ## News or Update
 
-**To experience adapter training using `auto_gptq` quantized model in advance, you can try [this branch](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration) and discuss [in here](https://github.com/PanQiWei/AutoGPTQ/issues/103), examples are [in here](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration/examples/peft).**
+- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
+- 2023-06-05 - (Update) - Integrate with 🤗 peft to use gptq quantized model to train adapters, support LoRA, AdaLoRA, AdaptionPrompt, etc.
-- 2023-05-25 - (In Progress) - Integrate with 🤗 peft to use gptq quantized model to train adapters, support LoRA, AdaLoRA, AdaptionPrompt, etc.
 - 2023-05-30 - (Update) - Support download/upload quantized model from/to 🤗 Hub.
 - 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefineWeb/RefineWebModel`(falcon) model types.
 - 2023-05-04 - (Update) - Support using faster cuda kernel when `not desc_act or group_size == -1`.
@@ -69,11 +70,7 @@ And to make sure `autogptq_cuda` is not ever in your virtual environment, run:
 ```shell
 pip uninstall autogptq_cuda -y
 ```
-#### to support LLaMa model
-For some people want to try LLaMa and whose `transformers` version not meet the newest one that supports it, using:
-```shell
-pip install auto-gptq[llama]
-```
 #### to support triton speedup
 To integrate with `triton`, using:
 > warning: currently triton only supports linux; 3-bit quantization is not supported when using triton
@@ -96,8 +93,6 @@ pip install .
 ```
 Like quick installation, you can also set `BUILD_CUDA_EXT=0` to disable pytorch extension building.
 
-Use `.[llama]` if you want to try LLaMa model.
-
 Use `.[triton]` if you want to integrate with triton and it's available on your operating system.
 
 </details>
@@ -304,18 +299,18 @@ print(
 >
 > for example, model_type of `WizardLM`, `vicuna` and `gpt4all` are all `llama`, hence they are all supported by `auto_gptq`.
 
-| model type | quantization | inference | peft-lora | peft-adaption_prompt |
-|------------|--------------|-----------|-----------|----------------------|
+| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
+|------------|--------------|-----------|-----------|---------------|----------------------|
-| bloom | ✅ | ✅ | | |
+| bloom | ✅ | ✅ | ✅ | ✅ | |
-| gpt2 | ✅ | ✅ | | |
+| gpt2 | ✅ | ✅ | ✅ | ✅ | |
-| gpt_neox | ✅ | ✅ | | |
+| gpt_neox | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| gptj | ✅ | ✅ | | |
+| gptj | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| llama | ✅ | ✅ | | ✅ |
+| llama | ✅ | ✅ | ✅ | ✅ | ✅ |
-| moss | ✅ | ✅ | | |
+| moss | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| opt | ✅ | ✅ | | |
+| opt | ✅ | ✅ | ✅ | ✅ | |
-| gpt_bigcode | ✅ | ✅ | | |
+| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
-| codegen | ✅ | ✅ | | |
+| codegen | ✅ | ✅ | ✅ | ✅ | |
-| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | | |
+| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
 
 ## Supported Evaluation Tasks
 Currently, `auto_gptq` supports: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more Tasks will come soon!
README_zh.md: 41 changes

@@ -12,14 +12,15 @@
     <p>
         <a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README.md">English</a> |
         <b>中文</b>
-    <p>
+    </p>
 </h4>
 
+*<center>📣 好久不见!👋 七月和八月将会迎来架构升级,性能优化和新特性,敬请关注!🥂</center>*
 
 ## 新闻或更新
 
-**提前体验使用 `auto_gptq` 量化过的模型来训练适应层,你可以尝试[这个分支](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration) 并在[这里](https://github.com/PanQiWei/AutoGPTQ/issues/103)进行讨论,你也可以参考[这里](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration/examples/peft)所提供的示例脚本。**
+- 2023-07-26 - (更新) - 一个优雅的 [PPL 测评脚本](examples/benchmark/perplexity.py)以获得可以与诸如 `llama.cpp` 等代码库进行公平比较的结果。
+- 2023-06-05 - (更新) - 集成 🤗 peft 来使用 gptq 量化过的模型训练适应层,支持 LoRA,AdaLoRA,AdaptionPrompt 等。
-- 2023-05-25 - (开发中) - 集成 🤗 peft 来使用 gptq 量化过的模型训练适应层,支持 LoRA,AdaLoRA,AdaptionPrompt 等。
 - 2023-05-30 - (更新) - 支持从 🤗 Hub 下载量化好的模型或上次量化好的模型到 🤗 Hub。
 - 2023-05-27 - (更新) - 支持以下模型的量化和推理: `gpt_bigcode`, `codegen` 以及 `RefineWeb/RefineWebModel`(falcon)。
 - 2023-05-04 - (更新) - 支持在 `not desc_act or group_size == -1` 的情况下使用更快的 cuda 算子。
@@ -69,11 +70,7 @@ BUILD_CUDA_EXT=0 pip install auto-gptq
 ```shell
 pip uninstall autogptq_cuda -y
 ```
-#### 支持使用 LLaMa 模型
-若想要尝试 LLaMa 模型,但 `transformers` 版本不为支持该模型的最新版本,使用以下命令:
-```shell
-pip install auto-gptq[llama]
-```
 #### 支持使用 triton 加速
 若想使用 `triton` 加速模型推理,使用以下命令:
 > 警告:目前 triton 仅支持 linux 操作系统;当使用 triton 时 3-bit 数值类型的量化将不被支持
@@ -96,8 +93,6 @@ pip install .
 ```
 正如在快速安装一节,你可以使用 `BUILD_CUDA_EXT=0` 来取消构建 cuda 拓展。
 
-如果你想要使用 LLaMa 模型,请使用 `.[llama]`。
-
 如果你想要使用 triton 加速且其能够被你的操作系统所支持,请使用 `.[triton]`。
 
 </details>
@@ -303,18 +298,18 @@ print(
 >
 > 比如, `WizardLM`,`vicuna` 和 `gpt4all` 模型的 `model_type` 皆为 `llama`, 因此这些模型皆被 `auto_gptq` 所支持。
 
-| model type | quantization | inference | peft-lora | peft-adaption_prompt |
-|------------|--------------|-----------|-----------|----------------------|
+| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
+|------------|--------------|-----------|-----------|---------------|----------------------|
-| bloom | ✅ | ✅ | | |
+| bloom | ✅ | ✅ | ✅ | ✅ | |
-| gpt2 | ✅ | ✅ | | |
+| gpt2 | ✅ | ✅ | ✅ | ✅ | |
-| gpt_neox | ✅ | ✅ | | |
+| gpt_neox | ✅ | ✅ | ✅ | ✅ | ✅[要求该分支的 peft](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| gptj | ✅ | ✅ | | |
+| gptj | ✅ | ✅ | ✅ | ✅ | ✅[要求该分支的 peft](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| llama | ✅ | ✅ | | ✅ |
+| llama | ✅ | ✅ | ✅ | ✅ | ✅ |
-| moss | ✅ | ✅ | | |
+| moss | ✅ | ✅ | ✅ | ✅ | ✅[要求该分支的 peft](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| opt | ✅ | ✅ | | |
+| opt | ✅ | ✅ | ✅ | ✅ | |
-| gpt_bigcode | ✅ | ✅ | | |
+| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
-| codegen | ✅ | ✅ | | |
+| codegen | ✅ | ✅ | ✅ | ✅ | |
-| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | | |
+| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
 
 ## 支持的评估任务
 目前, `auto_gptq` 支持以下评估任务: `LanguageModelingTask`, `SequenceClassificationTask` 和 `TextSummarizationTask`;更多的评估任务即将到来!
@@ -1,2 +1,4 @@
+__version__ = "0.3.2"
 from .modeling import BaseQuantizeConfig
 from .modeling import AutoGPTQForCausalLM
+from .utils.peft_utils import get_gptq_peft_model
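Taken together with the README changes above, the new top-level exports outline the basic quantize-and-save flow. The snippet below is a minimal illustration only; the model id, calibration text and output directory are placeholders, not part of this commit.

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

pretrained_model_id = "facebook/opt-125m"   # placeholder model id
quantized_model_dir = "opt-125m-4bit"       # placeholder output directory

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=True)
examples = [tokenizer("auto-gptq is an easy-to-use model quantization library.")]

quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

# Load the fp16 model, run GPTQ calibration on the examples, then save the result.
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_id, quantize_config)
model.quantize(examples)
model.save_quantized(quantized_model_dir, use_safetensors=True)
```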
@@ -10,4 +10,6 @@ from .opt import *
 from .rw import *
 from .gpt_bigcode import *
 from .codegen import *
+from .baichuan import *
+from .internlm import *
 from .mpt import *
@@ -20,10 +20,11 @@ from transformers.modeling_utils import no_init_weights
 from ._const import *
 from ._utils import *
+from ..nn_modules.qlinear import GeneralQuantLinear
 from ..nn_modules._fused_base import FusedBaseAttentionModule, FusedBaseMLPModule
 from ..quantization import GPTQ
 from ..utils.data_utils import collate_data
-from ..utils.import_utils import TRITON_AVAILABLE
+from ..utils.import_utils import dynamically_import_QuantLinear, TRITON_AVAILABLE, AUTOGPTQ_CUDA_AVAILABLE
 
 logger = getLogger(__name__)
 
@@ -112,7 +113,16 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
     fused_attn_module_type: Optional[FusedBaseAttentionModule] = None
     fused_mlp_module_type: Optional[FusedBaseMLPModule] = None
 
-    def __init__(self, model: PreTrainedModel, quantized: bool, quantize_config: BaseQuantizeConfig):
+    def __init__(
+        self,
+        model: PreTrainedModel,
+        quantized: bool,
+        quantize_config: BaseQuantizeConfig,
+        is_triton_backend: bool = False,
+        injected_fused_attention: bool = False,
+        injected_fused_mlp: bool = False,
+        trainable: bool = False
+    ):
         super().__init__()
 
         self.model = model
@@ -121,6 +131,11 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         self.quantize_config = quantize_config
         self.config = self.model.config
 
+        self.is_triton_backend = is_triton_backend
+        self.injected_fused_attention = injected_fused_attention
+        self.injected_fused_mlp = injected_fused_mlp
+        self.trainable = trainable
+
     @property
     def quantized(self):
         return self._quantized
@@ -431,6 +446,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         repo_id: str,
         save_dir: Optional[str] = None,
         use_safetensors: Optional[bool] = True,
+        safetensors_metadata: Optional[Dict[str, str]] = None,
         commit_message: Optional[str] = "Upload of AutoGPTQ quantized model",
         use_auth_token: Optional[Union[bool, str]] = None,
         private: Optional[bool] = None,
@@ -450,6 +466,10 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
             use_safetensors (`bool`, *optional*):
                 Save the model using `safetensors`.
                 If the model has already been saved, this parameter can be omitted.
+            safetensors_metadata: (`dict`, *optional*, defaults to `None`):
+                Pass optional metadata dictionary to be saved in the `safetensors` model file(s).
+                Metadata is optional and is purely for informational purposes. It does not affect inference.
+                If `None`, no metadata will be saved.
             commit_message (`str`, *optional*, defaults to `"Upload tool"`):
                 Message to commit while pushing.
             use_auth_token (`bool` or `str`, *optional*):
@@ -469,7 +489,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
 
         if save_dir is not None:
             logger.info(f"Saving model to {save_dir}")
-            self.save_quantized(save_dir, use_safetensors)
+            self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
 
         repo_url = create_repo(
             repo_id=repo_id, token=token, private=private, exist_ok=True, repo_type="model"
@@ -492,7 +512,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
                 repo_type="model",
             )
 
-    def save_quantized(self, save_dir: str, use_safetensors: bool = False):
+    def save_quantized(self, save_dir: str, use_safetensors: bool = False, safetensors_metadata: Optional[Dict[str, str]] = None):
         """save quantized model and configs to local disk"""
         os.makedirs(save_dir, exist_ok=True)
@@ -506,7 +526,42 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
             model_save_name = model_base_name + ".safetensors"
             state_dict = self.model.state_dict()
             state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
-            safe_save(state_dict, join(save_dir, model_save_name))
+            if safetensors_metadata is None:
+                safetensors_metadata = {}
+            elif not isinstance(safetensors_metadata, dict):
+                raise TypeError("safetensors_metadata must be a dictionary.")
+            else:
+                logger.debug(f"Received safetensors_metadata: {safetensors_metadata}")
+                new_safetensors_metadata = {}
+                converted_keys = False
+                for key, value in safetensors_metadata.items():
+                    if not isinstance(key, str) or not isinstance(value, str):
+                        converted_keys = True
+                        try:
+                            new_key = str(key)
+                            new_value = str(value)
+                        except Exception as e:
+                            raise TypeError(f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}")
+                        if new_key in new_safetensors_metadata:
+                            logger.warning(f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.")
+                        new_safetensors_metadata[new_key] = new_value
+                safetensors_metadata = new_safetensors_metadata
+                if converted_keys:
+                    logger.debug(f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}")
+
+            # Format is required to enable Accelerate to load the metadata
+            # otherwise it raises an OSError
+            safetensors_metadata['format'] = "pt"
+
+            # Store the quantization configuration as safetensors metadata
+            from auto_gptq import __version__
+            safetensors_metadata['auto_gptq_version'] = str(__version__)
+            safetensors_metadata['gptq_bits'] = str(self.quantize_config.bits)
+            safetensors_metadata['gptq_group_size'] = str(self.quantize_config.group_size)
+            safetensors_metadata['gptq_desc_act'] = str(self.quantize_config.desc_act)
+            safetensors_metadata['gptq_damp_percent'] = str(self.quantize_config.damp_percent)
+
+            safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata)
         else:
             model_save_name = model_base_name + ".bin"
             torch.save(self.model.state_dict(), join(save_dir, model_save_name))
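As a usage sketch for the hunk above: the metadata argument is a plain string-to-string dict that gets merged with the automatically written `gptq_*` and `format` entries. The directory and metadata values here are illustrative only, and `model` is assumed to be an already-quantized instance.

```python
# `model` is assumed to be an already-quantized BaseGPTQForCausalLM instance.
model.save_quantized(
    "opt-125m-4bit",                      # placeholder output directory
    use_safetensors=True,
    safetensors_metadata={
        "quantized_by": "example-user",   # arbitrary example keys and values
        "calibration_dataset": "c4",
    },
)
```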
@@ -516,10 +571,10 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         self.quantize_config.model_name_or_path = save_dir
         self.quantize_config.model_file_base_name = model_base_name
 
-    def save_pretrained(self, save_dir: str, use_safetensors: bool = False, **kwargs):
+    def save_pretrained(self, save_dir: str, use_safetensors: bool = False, safetensors_metadata: Optional[Dict[str, str]] = None, **kwargs):
         """alias of save_quantized"""
         logger.warning("you are using save_pretrained, which will re-direct to save_quantized.")
-        self.save_quantized(save_dir, use_safetensors)
+        self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
 
     @classmethod
     def from_pretrained(
@@ -543,7 +598,29 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         torch.nn.init.uniform_ = skip
         torch.nn.init.normal_ = skip
 
-        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
+        # Parameters related to loading from Hugging Face Hub
+        cache_dir = model_init_kwargs.pop("cache_dir", None)
+        force_download = model_init_kwargs.pop("force_download", False)
+        resume_download = model_init_kwargs.pop("resume_download", False)
+        proxies = model_init_kwargs.pop("proxies", None)
+        local_files_only = model_init_kwargs.pop("local_files_only", False)
+        use_auth_token = model_init_kwargs.pop("use_auth_token", None)
+        revision = model_init_kwargs.pop("revision", None)
+        subfolder = model_init_kwargs.pop("subfolder", "")
+        commit_hash = model_init_kwargs.pop("_commit_hash", None)
+
+        cached_file_kwargs = {
+            "cache_dir": cache_dir,
+            "force_download": force_download,
+            "proxies": proxies,
+            "resume_download": resume_download,
+            "local_files_only": local_files_only,
+            "use_auth_token": use_auth_token,
+            "revision": revision,
+            "subfolder": subfolder,
+        }
+
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True, **cached_file_kwargs)
         if config.model_type not in SUPPORTED_MODELS:
             raise TypeError(f"{config.model_type} isn't supported yet.")
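Because `from_pretrained` now strips Hub-related options out of `model_init_kwargs` and reuses them for both the config and the weights download, a call like the following sketch should be possible; the model id and cache directory are placeholders.

```python
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

quantize_config = BaseQuantizeConfig(bits=4, group_size=128)

# cache_dir/revision/local_files_only are popped into cached_file_kwargs and
# forwarded to AutoConfig.from_pretrained and AutoModelForCausalLM.from_pretrained.
model = AutoGPTQForCausalLM.from_pretrained(
    "facebook/opt-125m",      # placeholder model id
    quantize_config,
    cache_dir="./hf_cache",
    revision="main",
    local_files_only=False,
)
```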
@@ -579,7 +656,9 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
 
         torch.cuda.empty_cache()
 
-        model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs)
+        merged_kwargs = {**model_init_kwargs, **cached_file_kwargs}
+        model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **merged_kwargs)
 
         model_config = model.config.to_dict()
         seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
         if any([k in model_config for k in seq_len_keys]):
@@ -597,8 +676,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
     @classmethod
     def from_quantized(
         cls,
-        model_name_or_path: Optional[str] = None,
-        save_dir: Optional[str] = None,
+        model_name_or_path: Optional[str],
         device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None,
         max_memory: Optional[dict] = None,
         device: Optional[Union[str, int]] = None,
@@ -613,6 +691,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         use_safetensors: bool = False,
         trust_remote_code: bool = False,
         warmup_triton: bool = False,
+        trainable: bool = False,
         **kwargs
     ):
         """load quantized model from local disk"""
@@ -628,20 +707,25 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         subfolder = kwargs.pop("subfolder", "")
         commit_hash = kwargs.pop("_commit_hash", None)
 
+        cached_file_kwargs = {
+            "cache_dir": cache_dir,
+            "force_download": force_download,
+            "proxies": proxies,
+            "resume_download": resume_download,
+            "local_files_only": local_files_only,
+            "use_auth_token": use_auth_token,
+            "revision": revision,
+            "subfolder": subfolder,
+            "_raise_exceptions_for_missing_entries": False,
+            "_commit_hash": commit_hash,
+        }
+
         if use_triton and not TRITON_AVAILABLE:
             logger.warning("triton is not installed, reset use_triton to False")
             use_triton = False
 
         # == step1: prepare configs and file names == #
-        if model_name_or_path and save_dir:
-            logger.warning("save_dir will be ignored because model_name_or_path is explicit specified.")
-        if not model_name_or_path and save_dir:
-            model_name_or_path = save_dir
-            warnings.warn("save_dir is deprecated and will be removed in version 0.3.0", PendingDeprecationWarning, stacklevel=2)
-        if not model_name_or_path and not save_dir:
-            raise ValueError("at least one of model_name_or_path or save_dir should be specified.")
-
-        config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
-
+        config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code, **cached_file_kwargs)
         if config.model_type not in SUPPORTED_MODELS:
             raise TypeError(f"{config.model_type} isn't supported yet.")
@@ -670,25 +754,11 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         resolved_archive_file = None
         if is_local:
             model_save_name = join(model_name_or_path, model_basename)
 
             for ext in extensions:
                 if isfile(model_save_name + ext):
                     resolved_archive_file = model_save_name + ext
                     break
         else:  # remote
-            cached_file_kwargs = {
-                "cache_dir": cache_dir,
-                "force_download": force_download,
-                "proxies": proxies,
-                "resume_download": resume_download,
-                "local_files_only": local_files_only,
-                "use_auth_token": use_auth_token,
-                "revision": revision,
-                "subfolder": subfolder,
-                "_raise_exceptions_for_missing_entries": False,
-                "_commit_hash": commit_hash,
-            }
-
             for ext in extensions:
                 resolved_archive_file = cached_file(model_name_or_path, model_basename + ext, **cached_file_kwargs)
                 if resolved_archive_file is not None:
@@ -699,6 +769,9 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
 
         model_save_name = resolved_archive_file
 
+        if not use_triton and trainable:
+            logger.warning("QuantLinear with cuda backend not support trainable mode yet, Switch to the pytorch backend.")
+
         # == step2: convert model to gptq-model (replace Linear with QuantLinear) == #
         def skip(*args, **kwargs):
             pass
@@ -734,7 +807,8 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
             quantize_config.group_size,
             use_triton=use_triton,
             use_cuda_fp16=use_cuda_fp16,
-            desc_act=quantize_config.desc_act
+            desc_act=quantize_config.desc_act,
+            trainable=trainable
         )
         model.tie_weights()
 
@@ -794,6 +868,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         # == step5: (optional) inject optimized module == #
         if inject_fused_attention:
             if cls.fused_attn_module_type is None:
+                inject_fused_attention = False
                 logger.warning(f"{cls.__name__} hasn't fused attention module yet, will skip inject fused attention.")
             else:
                 cls.fused_attn_module_type.inject_to_model(
@@ -801,10 +876,12 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
                     use_triton=use_triton,
                     group_size=quantize_config.group_size,
                     use_cuda_fp16=use_cuda_fp16,
-                    desc_act=quantize_config.desc_act
+                    desc_act=quantize_config.desc_act,
+                    trainable=trainable
                 )
         if inject_fused_mlp:
             if cls.fused_mlp_module_type is None:
+                inject_fused_mlp = False
                 logger.warning(f"{cls.__name__} hasn't fused mlp module yet, will skip inject fused mlp.")
             else:
                 cls.fused_mlp_module_type.inject_to_model(
@@ -815,13 +892,26 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         model.eval()
         # == step6: (optional) warmup triton == #
         if use_triton and warmup_triton:
-            from ..nn_modules.qlinear_triton import QuantLinear
+            from ..nn_modules.qlinear.qlinear_triton import QuantLinear
             QuantLinear.warmup(model, seqlen=model.seqlen)
 
             if inject_fused_mlp and cls.fused_mlp_module_type is not None:
                 cls.fused_mlp_module_type.warmup(model, seqlen=model.seqlen)
 
-        return cls(model, True, quantize_config)
+        # == step7: make model compatible with peft
+        cls.make_sure_compatible_with_peft(
+            model, use_triton, quantize_config.desc_act, quantize_config.group_size
+        )
+
+        return cls(
+            model,
+            True,
+            quantize_config,
+            is_triton_backend=use_triton,
+            injected_fused_attention=inject_fused_attention,
+            injected_fused_mlp=inject_fused_mlp and use_triton,
+            trainable=trainable
+        )
 
     def warmup_triton(self, enabled: bool = True):
         if not enabled:
@@ -830,11 +920,34 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
             logger.warning(f"triton is not available, skip warmup stage directly.")
             return
 
-        from ..nn_modules.qlinear_triton import QuantLinear
+        from ..nn_modules.qlinear.qlinear_triton import QuantLinear
         QuantLinear.warmup(self.model, seqlen=self.model.seqlen)
 
         if self.fused_mlp_module_type is not None:
             self.fused_mlp_module_type.warmup(self.model, seqlen=self.model.seqlen)
 
+    def enable_trainable_mode(self, enabled: bool = True):
+        if not self.is_triton_backend and enabled:
+            raise NotImplementedError("For now, trainable mode only supports triton backend.")
+        for n, m in self.model.named_modules():
+            if hasattr(m, "trainable"):
+                setattr(m, "trainable", enabled)
+
+    def disable_trainable_mode(self):
+        self.enable_trainable_mode(enabled=False)
+
+    @staticmethod
+    def make_sure_compatible_with_peft(model: PreTrainedModel, use_triton: bool, desc_act: bool, group_size: int):
+        GeneralQuantLinear.inject_to_model(
+            model,
+            dynamically_import_QuantLinear(use_triton, desc_act, group_size)
+        )
+
+    def __getattr__(self, item):
+        try:
+            return super().__getattr__(item)
+        except:
+            return getattr(self.model, item)
+
 
 __all__ = ["BaseGPTQForCausalLM", "BaseQuantizeConfig"]
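To show how the new trainable plumbing, `enable_trainable_mode` and the peft hook are meant to fit together, here is a hedged sketch. It assumes a quantized model already exists at a placeholder path, and it assumes `get_gptq_peft_model` accepts a peft config plus a `train_mode` flag as in the repository's peft examples; exact argument names may differ.

```python
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from peft import LoraConfig, TaskType

# Trainable mode is only implemented for the triton backend, so load with use_triton=True.
model = AutoGPTQForCausalLM.from_quantized(
    "opt-125m-4bit",          # placeholder path to a quantized model
    device="cuda:0",
    use_triton=True,
    trainable=True,
)

# Flip the `trainable` flag on every injected QuantLinear module.
model.enable_trainable_mode()

# Wrap the quantized model with a LoRA adapter for fine-tuning; whether a plain
# peft.LoraConfig is accepted here is an assumption, see the note above.
lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=16, lora_dropout=0.05)
peft_model = get_gptq_peft_model(model, peft_config=lora_config, train_mode=True)
peft_model.print_trainable_parameters()
```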
@@ -1,12 +1,27 @@
 from packaging.version import parse as parse_version
 
 from torch import device
-from transformers import __version__ as transformers_version
+
+from ..utils.import_utils import compare_transformers_version
 
 CPU = device("cpu")
 CUDA_0 = device("cuda:0")
 
-SUPPORTED_MODELS = ["bloom", "gptj", "gpt2", "gpt_neox", "opt", "moss", "gpt_bigcode", "codegen", "RefinedWebModel", "RefinedWeb", "mpt"]
+SUPPORTED_MODELS = [
+    "bloom",
+    "gptj",
+    "gpt2",
+    "gpt_neox",
+    "opt",
+    "moss",
+    "gpt_bigcode",
+    "codegen",
+    "RefinedWebModel",
+    "RefinedWeb",
+    "baichuan",
+    "internlm",
+    "mpt",
+]
 if compare_transformers_version("v4.28.0", op="ge"):
     SUPPORTED_MODELS.append("llama")
@@ -50,7 +50,17 @@ def get_module_by_name_suffix(model, module_name: str):
             return module
 
 
-def make_quant(module, names, bits, group_size, name='', use_triton=False, use_cuda_fp16=True, desc_act=False):
+def make_quant(
+    module,
+    names,
+    bits,
+    group_size,
+    name='',
+    use_triton=False,
+    use_cuda_fp16=True,
+    desc_act=False,
+    trainable=False
+):
     QuantLinear = dynamically_import_QuantLinear(use_triton=use_triton, desc_act=desc_act, group_size=group_size)
 
     if isinstance(module, QuantLinear):
@@ -71,13 +81,25 @@ def make_quant(module, names, bits, group_size, name='', use_triton=False, use_c
             in_features = tmp.weight.shape[0]
             out_features = tmp.weight.shape[1]
             if (not(desc_act) or group_size == -1) and not use_triton:
-                new_layer = QuantLinear(bits, group_size, in_features, out_features, True, use_cuda_fp16=use_cuda_fp16)
+                new_layer = QuantLinear(
+                    bits, group_size, in_features, out_features, True, use_cuda_fp16=use_cuda_fp16, trainable=trainable
+                )
             else:
-                new_layer = QuantLinear(bits, group_size, in_features, out_features, True)
+                new_layer = QuantLinear(bits, group_size, in_features, out_features, True, trainable=trainable)
             new_layer.device = ori_layer_device
             setattr(module, attr, new_layer.to(ori_layer_device))
     for name1, child in module.named_children():
-        make_quant(child, names, bits, group_size, name + '.' + name1 if name != '' else name1, use_triton=use_triton, use_cuda_fp16=use_cuda_fp16,desc_act=desc_act)
+        make_quant(
+            child,
+            names,
+            bits,
+            group_size,
+            name + '.' + name1 if name != '' else name1,
+            use_triton=use_triton,
+            use_cuda_fp16=use_cuda_fp16,
+            desc_act=desc_act,
+            trainable=trainable
+        )
 
 
 def pack_model(
@@ -1,4 +1,5 @@
-from typing import Optional
+from inspect import signature
+from typing import Dict, Optional, Union
 
 from ._base import BaseQuantizeConfig, BaseGPTQForCausalLM
 from ._utils import check_and_get_model_type
@@ -12,6 +13,8 @@ from .moss import MOSSGPTQForCausalLM
 from .opt import OPTGPTQForCausalLM
 from .rw import RWGPTQForCausalLM
 from .gpt_bigcode import GPTBigCodeGPTQForCausalLM
+from .baichuan import BaiChuanGPTQForCausalLM
+from .internlm import InternLMGPTQForCausalLM
 from .mpt import MPTGPTQForCausalLM
 
 
@@ -26,8 +29,10 @@ GPTQ_CAUSAL_LM_MODEL_MAP = {
     "gpt_bigcode": GPTBigCodeGPTQForCausalLM,
     "codegen": CodeGenGPTQForCausalLM,
     "RefinedWebModel": RWGPTQForCausalLM,
-    "RefinedWeb":RWGPTQForCausalLM,
-    "mpt": MPTGPTQForCausalLM
+    "RefinedWeb": RWGPTQForCausalLM,
+    "baichuan": BaiChuanGPTQForCausalLM,
+    "internlm": InternLMGPTQForCausalLM,
+    "mpt": MPTGPTQForCausalLM,
 }
@@ -48,7 +53,9 @@ class AutoGPTQForCausalLM:
         trust_remote_code: bool = False,
         **model_init_kwargs
     ) -> BaseGPTQForCausalLM:
-        model_type = check_and_get_model_type(pretrained_model_name_or_path, trust_remote_code)
+        model_type = check_and_get_model_type(
+            pretrained_model_name_or_path, trust_remote_code
+        )
         return GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_pretrained(
             pretrained_model_name_or_path=pretrained_model_name_or_path,
             quantize_config=quantize_config,
@@ -60,8 +67,7 @@ class AutoGPTQForCausalLM:
     @classmethod
     def from_quantized(
         cls,
-        model_name_or_path: Optional[str] = None,
-        save_dir: Optional[str] = None,
+        model_name_or_path: Optional[str],
         device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None,
         max_memory: Optional[dict] = None,
         device: Optional[Union[str, int]] = None,
@@ -75,14 +81,32 @@ class AutoGPTQForCausalLM:
         use_safetensors: bool = False,
         trust_remote_code: bool = False,
         warmup_triton: bool = False,
+        trainable: bool = False,
         **kwargs
     ) -> BaseGPTQForCausalLM:
-        model_type = check_and_get_model_type(save_dir or model_name_or_path, trust_remote_code)
+        model_type = check_and_get_model_type(model_name_or_path, trust_remote_code)
         quant_func = GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized
-        keywords = {key: kwargs[key] for key in signature(quant_func).parameters if key in kwargs}
+        # A static list of kwargs needed for huggingface_hub
+        huggingface_kwargs = [
+            "cache_dir",
+            "force_download",
+            "proxies",
+            "resume_download",
+            "local_files_only",
+            "use_auth_token",
+            "revision",
+            "subfolder",
+            "_raise_exceptions_for_missing_entries",
+            "_commit_hash"
+        ]
+        # TODO: do we need this filtering of kwargs? @PanQiWei is there a reason we can't just pass all kwargs?
+        keywords = {
+            key: kwargs[key]
+            for key in list(signature(quant_func).parameters.keys()) + huggingface_kwargs
+            if key in kwargs
+        }
         return quant_func(
             model_name_or_path=model_name_or_path,
-            save_dir=save_dir,
             device_map=device_map,
             max_memory=max_memory,
             device=device,
@@ -96,6 +120,7 @@ class AutoGPTQForCausalLM:
             use_safetensors=use_safetensors,
             trust_remote_code=trust_remote_code,
             warmup_triton=warmup_triton,
+            trainable=trainable,
             **keywords
         )
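A loading-side sketch of the reworked `AutoGPTQForCausalLM.from_quantized`: the `save_dir` argument is gone, `trainable` is forwarded, and Hub download options survive the kwargs filtering thanks to the `huggingface_kwargs` whitelist. The repository id below is a placeholder.

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

repo_id = "some-user/opt-125m-4bit"   # placeholder Hub repo or local directory

tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(
    repo_id,
    device="cuda:0",
    use_safetensors=True,
    revision="main",            # kept by the huggingface_kwargs whitelist
    cache_dir="./hf_cache",
)

inputs = tokenizer("auto_gptq is", return_tensors="pt").to("cuda:0")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```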
auto_gptq/modeling/baichuan.py (new file, 16 lines)

@@ -0,0 +1,16 @@
+from ._base import *
+
+
+class BaiChuanGPTQForCausalLM(BaseGPTQForCausalLM):
+    layer_type = "DecoderLayer"
+    layers_block_name = "model.layers"
+    outside_layer_modules = ["model.embed_tokens", "model.norm"]
+    inside_layer_modules = [
+        ["self_attn.W_pack"],
+        ["self_attn.o_proj"],
+        ["mlp.up_proj", "mlp.gate_proj"],
+        ["mlp.down_proj"]
+    ]
+
+
+__all__ = ["BaiChuanGPTQForCausalLM"]
auto_gptq/modeling/internlm.py (new file, 16 lines)

@@ -0,0 +1,16 @@
+from ._base import *
+
+
+class InternLMGPTQForCausalLM(BaseGPTQForCausalLM):
+    layer_type = "InternLMDecoderLayer"
+    layers_block_name = "model.layers"
+    outside_layer_modules = ["model.embed_tokens", "model.norm"]
+    inside_layer_modules = [
+        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
+        ["self_attn.o_proj"],
+        ["mlp.up_proj", "mlp.gate_proj"],
+        ["mlp.down_proj"],
+    ]
+
+
+__all__ = ["InternLMGPTQForCausalLM"]
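The two new model files follow the same small recipe, so supporting another architecture mostly means describing where its linear layers live. The class below is a hypothetical illustration (names invented), not part of the commit; a real addition would also register the model type in `_const.SUPPORTED_MODELS` and `auto.GPTQ_CAUSAL_LM_MODEL_MAP`, exactly as the baichuan and internlm hunks do.

```python
from auto_gptq.modeling._base import BaseGPTQForCausalLM


class MyDecoderGPTQForCausalLM(BaseGPTQForCausalLM):
    # Class name of the repeated transformer block in the HF implementation (invented here).
    layer_type = "MyDecoderLayer"
    # Attribute path to the ModuleList that holds those blocks.
    layers_block_name = "model.layers"
    # Modules outside the blocks (embeddings, final norm) needed during calibration.
    outside_layer_modules = ["model.embed_tokens", "model.norm"]
    # Linear sub-modules to quantize, grouped roughly in forward order.
    inside_layer_modules = [
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.o_proj"],
        ["mlp.up_proj", "mlp.gate_proj"],
        ["mlp.down_proj"],
    ]


__all__ = ["MyDecoderGPTQForCausalLM"]
```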
@@ -18,7 +18,16 @@ class FusedBaseModule(nn.Module, TritonModuleMixin):
 class FusedBaseAttentionModule(FusedBaseModule):
     @classmethod
     @abstractmethod
-    def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
+    def inject_to_model(
+        cls,
+        model,
+        use_triton=False,
+        group_size=-1,
+        use_cuda_fp16=True,
+        desc_act=False,
+        trainable=False,
+        **kwargs
+    ):
         raise NotImplementedError()
 
     @classmethod
@@ -226,7 +226,16 @@ class FusedGPTJAttentionForQuantizedModel(FusedBaseAttentionModule):
         return outputs  # a, present, (attentions)
 
     @classmethod
-    def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
+    def inject_to_model(
+        cls,
+        model,
+        use_triton=False,
+        group_size=-1,
+        use_cuda_fp16=True,
+        desc_act=False,
+        trainable=False,
+        **kwargs
+    ):
         config = model.config
         QuantLinear = dynamically_import_QuantLinear(use_triton=use_triton, desc_act=desc_act, group_size=group_size)
 
@@ -253,7 +262,7 @@ class FusedGPTJAttentionForQuantizedModel(FusedBaseAttentionModule):
             q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
             True if q_proj.bias is not None else False,
         )
-        qlinear_kwargs = dict()
+        qlinear_kwargs = {"trainable": trainable}
         if (not desc_act or group_size == -1) and not use_triton:
             qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
         qkv_proj = QuantLinear(*qlinear_args, **qlinear_kwargs)
@@ -126,7 +126,16 @@ class FusedLlamaAttentionForQuantizedModel(FusedBaseAttentionModule):
         return attn_output, attn_weights, past_key_value
 
     @classmethod
-    def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
+    def inject_to_model(
+        cls,
+        model,
+        use_triton=False,
+        group_size=-1,
+        use_cuda_fp16=True,
+        desc_act=False,
+        trainable=False,
+        **kwargs
+    ):
         """
         Replace all LlamaAttention modules with QuantLlamaAttention modules, fusing the q, k, v projections.
         """
@@ -153,7 +162,7 @@ class FusedLlamaAttentionForQuantizedModel(FusedBaseAttentionModule):
             q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
             True if q_proj.bias is not None else False,
         )
-        qlinear_kwargs = dict()
+        qlinear_kwargs = {"trainable": trainable}
         if (not desc_act or group_size == -1) and not use_triton:
             qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
         qkv_layer = QuantLinear(*qlinear_args, **qlinear_kwargs)
@@ -237,14 +237,6 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
         up_proj,
     ):
         super().__init__()
-        self.register_buffer('gate_proj_qweight', gate_proj.qweight)
-        self.register_buffer('gate_proj_scales', gate_proj.scales)
-        self.register_buffer('gate_proj_qzeros', gate_proj.qzeros)
-        self.register_buffer('gate_proj_g_idx', gate_proj.g_idx)
-        self.register_buffer('up_proj_qweight', up_proj.qweight)
-        self.register_buffer('up_proj_scales', up_proj.scales)
-        self.register_buffer('up_proj_qzeros', up_proj.qzeros)
-        self.register_buffer('up_proj_g_idx', up_proj.g_idx)
 
         self.infeatures = gate_proj.infeatures
         self.intermediate_size = gate_proj.outfeatures
@@ -252,6 +244,8 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
         self.bits = gate_proj.bits
         self.maxq = gate_proj.maxq
 
+        self.gate_proj = gate_proj
+        self.up_proj = up_proj
         self.down_proj = down_proj
 
     def forward(self, x):
@@ -266,40 +260,20 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
         c = torch.empty((M, N), device=x.device, dtype=torch.float16)
         grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
         quant_fused_matmul_248_kernel[grid](
-            x, c, self.gate_proj_qweight,
-            self.gate_proj_scales, self.gate_proj_qzeros, self.gate_proj_g_idx,
-            self.up_proj_qweight,
-            self.up_proj_scales, self.up_proj_qzeros, self.up_proj_g_idx,
+            x, c, self.gate_proj.qweight,
+            self.gate_proj.scales, self.gate_proj.qzeros, self.gate_proj.g_idx,
+            self.up_proj.qweight,
+            self.up_proj.scales, self.up_proj.qzeros, self.up_proj.g_idx,
             M, N, K,
             self.bits, self.maxq,
             x.stride(0), x.stride(1),
-            self.gate_proj_qweight.stride(0), self.gate_proj_qweight.stride(1),
+            self.gate_proj.qweight.stride(0), self.gate_proj.qweight.stride(1),
             c.stride(0), c.stride(1),
-            self.gate_proj_scales.stride(0), self.gate_proj_qzeros.stride(0)
+            self.gate_proj.scales.stride(0), self.gate_proj.qzeros.stride(0)
         )
         c = c.reshape(out_shape)
         return c
 
-    def fused2cuda(self):
-        self.gate_proj_qweight = self.gate_proj_qweight.cuda()
-        self.gate_proj_scales = self.gate_proj_scales.cuda()
-        self.gate_proj_qzeros = self.gate_proj_qzeros.cuda()
-        self.gate_proj_g_idx = self.gate_proj_g_idx.cuda()
-        self.up_proj_qweight = self.up_proj_qweight.cuda()
-        self.up_proj_scales = self.up_proj_scales.cuda()
-        self.up_proj_qzeros = self.up_proj_qzeros.cuda()
-        self.up_proj_g_idx = self.up_proj_g_idx.cuda()
-
-    def fused2cpu(self):
-        self.gate_proj_qweight = self.gate_proj_qweight.cpu()
-        self.gate_proj_scales = self.gate_proj_scales.cpu()
-        self.gate_proj_qzeros = self.gate_proj_qzeros.cpu()
-        self.gate_proj_g_idx = self.gate_proj_g_idx.cpu()
-        self.up_proj_qweight = self.up_proj_qweight.cpu()
-        self.up_proj_scales = self.up_proj_scales.cpu()
-        self.up_proj_qzeros = self.up_proj_qzeros.cpu()
-        self.up_proj_g_idx = self.up_proj_g_idx.cpu()
-
     @classmethod
     def inject_to_model(cls, model, use_triton=False, **kwargs):
         if not use_triton:
auto_gptq/nn_modules/qlinear/__init__.py (new file, 57 lines)

@@ -0,0 +1,57 @@
+import torch.nn as nn
+
+
+class GeneralQuantLinear(nn.Linear):
+    def __init__(self, quant_linear_module):
+        super().__init__(
+            in_features=quant_linear_module.infeatures,
+            out_features=quant_linear_module.outfeatures,
+            bias=True
+        )
+        self.infeatures = quant_linear_module.infeatures
+        self.outfeatures = quant_linear_module.outfeatures
+        self.bits = quant_linear_module.bits
+        self.group_size = quant_linear_module.group_size
+        self.maxq = quant_linear_module.maxq
+
+        self.weight.requires_grad = False
+
+        self.weight.data = quant_linear_module.qweight
+        self.qweight = self.weight
+        self.bias.data = quant_linear_module.bias
+
+        self.qweight.requires_grad = False
+        self.bias.requires_grad = False
+
+        self.qzeros = quant_linear_module.qzeros
+        self.scales = quant_linear_module.scales
+        self.g_idx = quant_linear_module.g_idx
+
+        if hasattr(quant_linear_module, "wf"):
+            self.wf = quant_linear_module.wf
+        if hasattr(quant_linear_module, "kernel_switch_threshold"):
+            self.kernel_switch_threshold = quant_linear_module.kernel_switch_threshold
+        if hasattr(quant_linear_module, "autogptq_cuda_available"):
+            self.autogptq_cuda_available = quant_linear_module.autogptq_cuda_available
+
+        self.trainable = quant_linear_module.trainable
+
+        self.forward = quant_linear_module.forward
+
+    @classmethod
+    def inject_to_model(cls, model, target_module_type):
+        for name, m in model.named_modules():
+            if not isinstance(m, target_module_type):
+                continue
+            new_m = cls(m)
+            if '.' in name:
+                parent_name = name.rsplit('.', 1)[0]
+                child_name = name[len(parent_name) + 1:]
+                parent = model.get_submodule(parent_name)
+            else:
+                parent_name = ''
+                parent = model
+                child_name = name
+
+            setattr(parent, child_name, new_m)
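For reference, this is how the new wrapper is driven from `BaseGPTQForCausalLM.make_sure_compatible_with_peft`; the helper below simply restates that call as a standalone sketch.

```python
import torch.nn as nn

from auto_gptq.nn_modules.qlinear import GeneralQuantLinear
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear


def make_peft_compatible(model: nn.Module, use_triton: bool, desc_act: bool, group_size: int) -> None:
    # Resolve the backend-specific QuantLinear class that make_quant used,
    # then swap every instance for a GeneralQuantLinear, so that libraries like
    # peft see a familiar nn.Linear subclass when scanning the module tree.
    QuantLinear = dynamically_import_QuantLinear(use_triton=use_triton, desc_act=desc_act, group_size=group_size)
    GeneralQuantLinear.inject_to_model(model, QuantLinear)
```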
@@ -9,11 +9,13 @@ import transformers
 logger = getLogger(__name__)
 
 try:
-    import autogptq_cuda
+    import autogptq_cuda_256
+    import autogptq_cuda_64
     _autogptq_cuda_available = True
 except ImportError:
     logger.warning('CUDA extension not installed.')
+    autogptq_cuda_256 = None
+    autogptq_cuda_64 = None
     _autogptq_cuda_available = False
 
 
@@ -26,10 +28,14 @@ class QuantLinear(nn.Module):
         outfeatures,
         bias,
         kernel_switch_threshold=128,
+        trainable=False
     ):
         super().__init__()
+        global _autogptq_cuda_available
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+        if trainable:
+            _autogptq_cuda_available = False
 
         self.infeatures = infeatures
         self.outfeatures = outfeatures
@@ -73,9 +79,15 @@ class QuantLinear(nn.Module):
 
         self.kernel_switch_threshold = kernel_switch_threshold
         self.autogptq_cuda_available = _autogptq_cuda_available
+
+        self.autogptq_cuda = autogptq_cuda_256
         if infeatures % 256 != 0 or outfeatures % 256 != 0:
+            self.autogptq_cuda = autogptq_cuda_64
+        if infeatures % 64 != 0 or outfeatures % 64 != 0:
             self.autogptq_cuda_available = False
 
+        self.trainable = trainable
+
     def pack(self, linear, scales, zeros, g_idx=None):
         W = linear.weight.data.clone()
         if isinstance(linear, nn.Conv2d):
@@ -184,13 +196,13 @@ class QuantLinear(nn.Module):
         ):
             out = torch.zeros((x.shape[0], self.outfeatures), device=x.device, dtype=torch.float32)
             if self.bits == 2:
-                autogptq_cuda.vecquant2matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
+                self.autogptq_cuda.vecquant2matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
             elif self.bits == 3:
-                autogptq_cuda.vecquant3matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
+                self.autogptq_cuda.vecquant3matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
             elif self.bits == 4:
-                autogptq_cuda.vecquant4matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
+                self.autogptq_cuda.vecquant4matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
             elif self.bits == 8:
-                autogptq_cuda.vecquant8matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
+                self.autogptq_cuda.vecquant8matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
             else:
                 raise NotImplementedError("Only 2,3,4,8 bits are supported.")
         else:
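The net effect of the constructor hunks above: the extension built for 256-aligned shapes is preferred, the 64-aligned build is the fallback, and the CUDA path is disabled entirely (falling back to the pure-PyTorch dequantization path) when even 64 does not divide the feature dimensions or when the layer is created with trainable=True. A small sketch of how one might confirm which kernel a layer ends up with; the layer dimensions here are just an assumption:

from auto_gptq.nn_modules.qlinear.qlinear_cuda import QuantLinear

# 4096 and 11008 are both divisible by 256, so this layer should bind autogptq_cuda_256
layer = QuantLinear(bits=4, group_size=128, infeatures=4096, outfeatures=11008, bias=False)
ext_name = getattr(layer.autogptq_cuda, "__name__", None)  # None if the CUDA extension isn't installed
print(ext_name, layer.autogptq_cuda_available)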
@@ -7,15 +7,17 @@ import torch.nn as nn
 import transformers
 
 logger = getLogger(__name__)
 
 try:
-    import autogptq_cuda
+    import autogptq_cuda_256
+    import autogptq_cuda_64
     _autogptq_cuda_available = True
 except ImportError:
     logger.warning('CUDA extension not installed.')
+    autogptq_cuda_256 = None
+    autogptq_cuda_64 = None
     _autogptq_cuda_available = False
 
 
 class QuantLinear(nn.Module):
     def __init__(
         self,
@@ -25,12 +27,15 @@ class QuantLinear(nn.Module):
         outfeatures,
         bias,
         use_cuda_fp16=True,
-        kernel_switch_threshold=128
+        kernel_switch_threshold=128,
+        trainable=False
     ):
-
         super().__init__()
+        global _autogptq_cuda_available
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+        if trainable:
+            _autogptq_cuda_available = False
         self.infeatures = infeatures
         self.outfeatures = outfeatures
         self.bits = bits
@@ -77,10 +82,21 @@ class QuantLinear(nn.Module):
 
         self.kernel_switch_threshold = kernel_switch_threshold
         self.autogptq_cuda_available = _autogptq_cuda_available
+        self.autogptq_cuda = autogptq_cuda_256
         if infeatures % 256 != 0 or outfeatures % 256 != 0:
+            self.autogptq_cuda = autogptq_cuda_64
+        if infeatures % 64 != 0 or outfeatures % 64 != 0:
             self.autogptq_cuda_available = False
 
+        self.trainable = trainable
+
     def pack(self, linear, scales, zeros, g_idx):
+        W = linear.weight.data.clone()
+        if isinstance(linear, nn.Conv2d):
+            W = W.flatten(1)
+        if isinstance(linear, transformers.pytorch_utils.Conv1D):
+            W = W.t()
+
         scales = scales.t().contiguous()
         zeros = zeros.t().contiguous()
         scale_zeros = zeros * scales
@@ -93,7 +109,7 @@ class QuantLinear(nn.Module):
             g_idx = idx // self.group_size
             intweight.append(
                 torch.round(
-                    (linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]
+                    (W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]
                 ).to(torch.int)[:, None]
             )
         intweight = torch.cat(intweight, dim=1)
@@ -182,24 +198,24 @@ class QuantLinear(nn.Module):
             if self.use_cuda_fp16:
                 x = x.half()
                 if self.bits == 2:
-                    autogptq_cuda.vecquant2matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
+                    self.autogptq_cuda.vecquant2matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
                 elif self.bits == 3:
-                    autogptq_cuda.vecquant3matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
+                    self.autogptq_cuda.vecquant3matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
                 elif self.bits == 4:
-                    autogptq_cuda.vecquant4matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
+                    self.autogptq_cuda.vecquant4matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
 
                 else:
                     raise NotImplementedError("Only 2,3,4 bits are supported.")
             else:
                 x = x.float()
                 if self.bits == 2:
-                    autogptq_cuda.vecquant2matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
+                    self.autogptq_cuda.vecquant2matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
                 elif self.bits == 3:
-                    autogptq_cuda.vecquant3matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
+                    self.autogptq_cuda.vecquant3matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
                 elif self.bits == 4:
-                    autogptq_cuda.vecquant4matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
+                    self.autogptq_cuda.vecquant4matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
                 elif self.bits == 8:
-                    autogptq_cuda.vecquant8matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
+                    self.autogptq_cuda.vecquant8matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
                 else:
                     raise NotImplementedError("Only 2,3,4,8 bits are supported.")
         else:
@@ -1,17 +1,20 @@
 import math
+from logging import getLogger
+
 import numpy as np
 import torch
 import torch.nn as nn
 import transformers
-from torch.cuda.amp import custom_bwd, custom_fwd
-from logging import getLogger
 
-from .triton_utils.mixin import TritonModuleMixin
+from ..triton_utils.mixin import TritonModuleMixin
 
 logger = getLogger(__name__)
 
 try:
-    from .triton_utils.kernels import quant_matmul_248, transpose_quant_matmul_248, QuantLinearFunction
+    from ..triton_utils.kernels import (
+        quant_matmul_248, transpose_quant_matmul_248, quant_matmul_inference_only_248,
+        QuantLinearFunction, QuantLinearInferenceOnlyFunction
+    )
 except ImportError:
     logger.error('triton not installed.')
     raise
@@ -24,13 +27,14 @@ class QuantLinear(nn.Module, TritonModuleMixin):
         group_size,
         infeatures,
         outfeatures,
-        bias
+        bias,
+        trainable=False
     ):
         super().__init__()
         if bits not in [2, 4, 8]:
             raise NotImplementedError("Only 2,4,8 bits are supported.")
-        if infeatures % 256 != 0 or outfeatures % 256 != 0:
-            raise NotImplementedError("in_feature or out_feature must be divisible by 256.")
+        if infeatures % 32 != 0 or outfeatures % 32 != 0:
+            raise NotImplementedError("in_feature and out_feature must be divisible by 32.")
         self.infeatures = infeatures
         self.outfeatures = outfeatures
         self.bits = bits
@@ -58,6 +62,8 @@ class QuantLinear(nn.Module, TritonModuleMixin):
         else:
             self.bias = None
 
+        self.trainable = trainable
+
     def pack(self, linear, scales, zeros, g_idx=None):
         W = linear.weight.data.clone()
         if isinstance(linear, nn.Conv2d):
@@ -122,7 +128,8 @@ class QuantLinear(nn.Module, TritonModuleMixin):
 
     def forward(self, x):
         out_shape = x.shape[:-1] + (self.outfeatures,)
-        out = QuantLinearFunction.apply(
+        quant_linear_fn = QuantLinearFunction if self.trainable else QuantLinearInferenceOnlyFunction
+        out = quant_linear_fn.apply(
             x.reshape(-1, x.shape[-1]),
             self.qweight,
             self.scales,
@@ -160,11 +167,14 @@ class QuantLinear(nn.Module, TritonModuleMixin):
         for m in tqdm(range(0, math.ceil(math.log2(seqlen)) + 1)):
             m = 2 ** m
             for (k, n), (qweight, scales, qzeros, g_idx, bits, maxq) in kn_values.items():
-                a = torch.randn(m, k, dtype=torch.float16, device=model.device)
-                quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
                 if transpose:
+                    a = torch.randn(m, k, dtype=torch.float16, device=model.device)
+                    quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
                     a = torch.randn(m, n, dtype=torch.float16, device=model.device)
                     transpose_quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
+                else:
+                    a = torch.randn(m, k, dtype=torch.float16, device=model.device)
+                    quant_matmul_inference_only_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
         del kn_values
 
@@ -73,27 +73,7 @@ logger = getLogger(__name__)
             },
             num_stages=2,
            num_warps=8
-        ),
-        triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 64,
-                'GROUP_SIZE_M': 8
-            },
-            num_stages=3,
-            num_warps=8
-        ),
-        triton.Config(
-            {
-                'BLOCK_SIZE_M': 32,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 128,
-                'GROUP_SIZE_M': 8
-            },
-            num_stages=2,
-            num_warps=4
-        ),
+        )
     ],
     key=['M', 'N', 'K'],
     nearest_power_of_two=True,
@@ -244,27 +224,7 @@ def quant_matmul_248_kernel(
             },
             num_stages=2,
             num_warps=8
-        ),
-        triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 64,
-                'GROUP_SIZE_M': 8
-            },
-            num_stages=3,
-            num_warps=8
-        ),
-        triton.Config(
-            {
-                'BLOCK_SIZE_M': 32,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 32,
-                'GROUP_SIZE_M': 8
-            },
-            num_stages=2,
-            num_warps=4
-        ),
+        )
     ],
     key=['M', 'N', 'K'],
     nearest_power_of_two=True
@@ -356,7 +316,6 @@ def silu(x):
     return x * tl.sigmoid(x)
 
 
-
 def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq):
     with torch.cuda.device(input.device):
         output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=input.dtype)
@@ -414,3 +373,30 @@ class QuantLinearFunction(torch.autograd.Function):
         if ctx.needs_input_grad[0]:
             grad_input = transpose_quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
         return grad_input, None, None, None, None, None, None
+
+
+def quant_matmul_inference_only_248(input, qweight, scales, qzeros, g_idx, bits, maxq):
+    with torch.cuda.device(input.device):
+        output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16)
+        grid = lambda META: (
+            triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(qweight.shape[1], META['BLOCK_SIZE_N']),
+        )
+        quant_matmul_248_kernel[grid](
+            input, qweight, output,
+            scales, qzeros, g_idx,
+            input.shape[0], qweight.shape[1], input.shape[1],
+            bits, maxq,
+            input.stride(0), input.stride(1),
+            qweight.stride(0), qweight.stride(1),
+            output.stride(0), output.stride(1),
+            scales.stride(0), qzeros.stride(0)
+        )
+        return output
+
+
+class QuantLinearInferenceOnlyFunction(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float16)
+    def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
+        output = quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq)
+        return output
@@ -0,0 +1 @@
+from .perplexity_utils import Perplexity
@@ -7,15 +7,22 @@ try:
 except ImportError:
     TRITON_AVAILABLE = False
 
+try:
+    import autogptq_cuda
+
+    AUTOGPTQ_CUDA_AVAILABLE = True
+except:
+    AUTOGPTQ_CUDA_AVAILABLE = False
+
+
 def dynamically_import_QuantLinear(use_triton: bool, desc_act: bool, group_size: int):
     if use_triton:
-        from ..nn_modules.qlinear_triton import QuantLinear
+        from ..nn_modules.qlinear.qlinear_triton import QuantLinear
     else:
         if not desc_act or group_size == -1:
-            from ..nn_modules.qlinear_old import QuantLinear
+            from ..nn_modules.qlinear.qlinear_cuda_old import QuantLinear
         else:
-            from ..nn_modules.qlinear import QuantLinear
+            from ..nn_modules.qlinear.qlinear_cuda import QuantLinear
 
     return QuantLinear
 
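Assuming this hunk belongs to auto_gptq/utils/import_utils.py, a quick sketch of how the resolver behaves after the package reorganisation; the argument values are only examples:

from auto_gptq.utils.import_utils import dynamically_import_QuantLinear

# act-order (desc_act=True) with a real group size -> the exact CUDA kernels
QuantLinear = dynamically_import_QuantLinear(use_triton=False, desc_act=True, group_size=128)
print(QuantLinear.__module__)  # expected: auto_gptq.nn_modules.qlinear.qlinear_cuda

# no act-order (or group_size == -1) -> the "old" fused-friendly CUDA kernels
QuantLinearOld = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=128)
print(QuantLinearOld.__module__)  # expected: auto_gptq.nn_modules.qlinear.qlinear_cuda_old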
auto_gptq/utils/peft_utils.py (new file, 423 lines)
@@ -0,0 +1,423 @@
import warnings
import re
from contextlib import contextmanager
from dataclasses import asdict
from enum import Enum
from typing import List, Optional

import torch
from peft import get_peft_model, PeftConfig, PeftModel, PeftType
from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING
from peft.tuners.lora import LoraConfig, LoraLayer, LoraModel, Embedding
from peft.tuners.adalora import AdaLoraConfig, AdaLoraLayer, AdaLoraModel
from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING
from peft.utils.other import _get_submodules

from ..modeling._base import BaseGPTQForCausalLM


class GPTQLoraConfig(LoraConfig):
    injected_fused_attention: bool = False
    injected_fused_mlp: bool = False


class GPTQLoraLinear(torch.nn.Linear, LoraLayer):
    def __init__(
        self,
        adapter_name: str,
        linear_module: torch.nn.Linear,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        **kwargs,
    ):
        init_lora_weights = kwargs.pop("init_lora_weights", True)

        torch.nn.Linear.__init__(self, linear_module.in_features, linear_module.out_features)
        LoraLayer.__init__(self, linear_module.in_features, linear_module.out_features)

        self.linear_module = linear_module

        self.weight.requires_grad = False
        self.weight = self.linear_module.weight
        self.bias = self.linear_module.bias
        self.fan_in_fan_out = fan_in_fan_out
        if fan_in_fan_out:
            self.weight.data = self.weight.data.T

        self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
        self.active_adapter = adapter_name

    def reset_lora_parameters(self, adapter_name):
        if adapter_name in self.lora_A.keys():
            torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
            torch.nn.init.zeros_(self.lora_B[adapter_name].weight)

    def merge(self):
        raise NotImplementedError("gptq model not support merge lora adapter")

    def unmerge(self):
        raise NotImplementedError("gptq model not support unmerge lora adapter")

    def forward(self, x: torch.Tensor):
        previous_dtype = x.dtype
        if self.active_adapter not in self.lora_A.keys():
            return self.linear_module(x)
        if self.disable_adapters:
            if self.r[self.active_adapter] > 0 and self.merged:
                self.unmerge()
            result = self.linear_module(x)
        elif self.r[self.active_adapter] > 0 and not self.merged:
            result = self.linear_module(x)

            lora_B = self.lora_B[self.active_adapter]
            lora_A = self.lora_A[self.active_adapter]
            lora_dropout = self.lora_dropout[self.active_adapter]
            scale = self.scaling[self.active_adapter]

            x = x.type_as(lora_A.weight.data)
            adapter_result = (lora_B(lora_A(lora_dropout(x))) * scale).type_as(result)
            result += adapter_result
        else:
            result = self.linear_module(x)

        result = result.to(previous_dtype)

        return result


class GPTQLoraModel(LoraModel):
    def _find_and_replace(self, adapter_name):
        lora_config = self.peft_config[adapter_name]
        is_target_modules_in_base_model = False
        kwargs = {
            "r": lora_config.r,
            "lora_alpha": lora_config.lora_alpha,
            "lora_dropout": lora_config.lora_dropout,
            "fan_in_fan_out": lora_config.fan_in_fan_out,
            "init_lora_weights": lora_config.init_lora_weights,
        }
        key_list = [key for key, _ in self.model.named_modules()]
        for key in key_list:
            if isinstance(lora_config.target_modules, str):
                target_module_found = re.fullmatch(lora_config.target_modules, key)
            else:
                target_module_found = any(key.endswith(target_key) for target_key in lora_config.target_modules)
            if target_module_found:
                if not is_target_modules_in_base_model:
                    is_target_modules_in_base_model = True
                parent, target, target_name = _get_submodules(self.model, key)
                bias = False
                if hasattr(target, "bias"):
                    bias = target.bias is not None

                if isinstance(target, LoraLayer):
                    target.update_layer(
                        adapter_name,
                        lora_config.r,
                        lora_config.lora_alpha,
                        lora_config.lora_dropout,
                        lora_config.init_lora_weights,
                    )
                else:
                    if isinstance(target, torch.nn.Embedding):
                        embedding_kwargs = kwargs.copy()
                        embedding_kwargs.pop("fan_in_fan_out", None)
                        in_features, out_features = target.num_embeddings, target.embedding_dim
                        new_module = Embedding(adapter_name, in_features, out_features, **embedding_kwargs)
                    else:
                        if isinstance(target, torch.nn.Linear):
                            if kwargs["fan_in_fan_out"]:
                                warnings.warn(
                                    "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
                                    "Setting fan_in_fan_out to False."
                                )
                                kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
                        else:
                            raise ValueError(
                                f"Target module {target} is not supported. "
                                f"Currently, only `torch.nn.Linear` and its subclasses are supported."
                            )
                        new_module = GPTQLoraLinear(adapter_name, target, **kwargs)

                self._replace_module(parent, target_name, new_module, target)
        if not is_target_modules_in_base_model:
            raise ValueError(
                f"Target modules {lora_config.target_modules} not found in the base model. "
                f"Please check the target modules and try again."
            )

    def _replace_module(self, parent_module, child_name, new_module, old_module):
        setattr(parent_module, child_name, new_module)
        if not isinstance(new_module, GPTQLoraLinear):
            new_module.weight = old_module.weight
            if hasattr(old_module, "bias"):
                if old_module.bias is not None:
                    new_module.bias = old_module.bias

            if getattr(old_module, "state", None) is not None:
                new_module.state = old_module.state
                new_module.to(old_module.weight.device)

            # dispatch to correct device
            for name, module in new_module.named_modules():
                if "lora_" in name:
                    module.to(old_module.weight.device)

    def merge_adapter(self):
        raise NotImplementedError("gptq model not support merge ada lora adapter")

    def unmerge_adapter(self):
        raise NotImplementedError("gptq model not support unmerge ada lora adapter")

    def merge_and_unload(self):
        raise NotImplementedError("gptq model not support merge and unload")


class GPTQAdaLoraConfig(AdaLoraConfig):
    injected_fused_attention: bool = False
    injected_fused_mlp: bool = False


class GPTQSVDLinear(torch.nn.Linear, AdaLoraLayer):
    def __init__(
        self,
        adapter_name: str,
        linear_module: torch.nn.Linear,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        **kwargs,
    ):
        init_lora_weights = kwargs.pop("init_lora_weights", True)

        torch.nn.Linear.__init__(self, linear_module.in_features, linear_module.out_features)
        AdaLoraLayer.__init__(self, linear_module.in_features, linear_module.out_features)

        self.linear_module = linear_module

        self.weight.requires_grad = False
        self.weight = self.linear_module.weight
        self.bias = self.linear_module.bias
        self.fan_in_fan_out = fan_in_fan_out
        if fan_in_fan_out:
            self.weight.data = self.weight.data.T

        self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
        self.active_adapter = adapter_name

    def merge(self):
        raise NotImplementedError("gptq model not support merge lora adapter")

    def unmerge(self):
        raise NotImplementedError("gptq model not support unmerge lora adapter")

    def forward(self, x: torch.Tensor):
        if self.active_adapter not in self.lora_A.keys():
            return self.linear_module(x)
        if self.disable_adapters:
            if self.r[self.active_adapter] > 0 and self.merged:
                self.unmerge()
            result = self.linear_module(x)
        elif self.r[self.active_adapter] > 0 and not self.merged:
            result = self.linear_module(x)
            result += (
                (
                    self.lora_dropout[self.active_adapter](x)
                    @ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter]).T
                    @ self.lora_B[self.active_adapter].T
                )
                * self.scaling[self.active_adapter]
                / (self.ranknum[self.active_adapter] + 1e-5)
            )
        else:
            result = self.linear_module(x)
        return result


class GPTQAdaLoraModel(AdaLoraModel):
    def _find_and_replace(self, adapter_name):
        lora_config = self.peft_config[adapter_name]
        is_target_modules_in_base_model = False
        kwargs = {
            "r": lora_config.init_r,
            "lora_alpha": lora_config.lora_alpha,
            "lora_dropout": lora_config.lora_dropout,
            "fan_in_fan_out": lora_config.fan_in_fan_out,
            "init_lora_weights": lora_config.init_lora_weights,
        }
        key_list = [key for key, _ in self.model.named_modules()]
        for key in key_list:
            if isinstance(lora_config.target_modules, str):
                target_module_found = re.fullmatch(lora_config.target_modules, key)
            else:
                target_module_found = any(key.endswith(target_key) for target_key in lora_config.target_modules)
            if target_module_found:
                if not is_target_modules_in_base_model:
                    is_target_modules_in_base_model = True
                parent, target, target_name = _get_submodules(self.model, key)
                bias = target.bias is not None
                if isinstance(target, LoraLayer):
                    target.update_layer(
                        adapter_name,
                        lora_config.init_r,
                        lora_config.lora_alpha,
                        lora_config.lora_dropout,
                        lora_config.init_lora_weights,
                    )
                else:
                    if isinstance(target, torch.nn.Linear):
                        in_features, out_features = target.in_features, target.out_features
                        if kwargs["fan_in_fan_out"]:
                            warnings.warn(
                                "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
                                "Setting fan_in_fan_out to False."
                            )
                            kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
                    else:
                        raise ValueError(
                            f"Target module {target} is not supported. "
                            f"Currently, only `torch.nn.Linear` and its subclasses are supported."
                        )
                    new_module = GPTQSVDLinear(adapter_name, target, **kwargs)

                self._replace_module(parent, target_name, new_module, target)
        if not is_target_modules_in_base_model:
            raise ValueError(
                f"Target modules {lora_config.target_modules} not found in the base model. "
                f"Please check the target modules and try again."
            )

    def _replace_module(self, parent_module, child_name, new_module, old_module):
        setattr(parent_module, child_name, new_module)

        # dispatch to correct device
        for name, module in new_module.named_modules():
            if "lora_" in name:
                module.to(old_module.weight.device)

    def merge_adapter(self):
        raise NotImplementedError("gptq model not support merge ada lora adapter")

    def unmerge_adapter(self):
        raise NotImplementedError("gptq model not support unmerge ada lora adapter")

    def merge_and_unload(self):
        raise NotImplementedError("gptq model not support merge and unload")


def find_all_linear_names(model: BaseGPTQForCausalLM, ignore: Optional[List[str]] = None, ignore_lm_head: bool = True):
    if not ignore:
        ignore = []
    lm_head_name = model.lm_head_name
    if ignore_lm_head and lm_head_name not in ignore:
        ignore.append(lm_head_name)
    results = set()
    for n, m in model.named_modules():
        if isinstance(m, torch.nn.Linear):
            res = n.split('.')[-1]
            if res not in ignore:
                results.add(res)
    return list(results)


@contextmanager
def hijack_peft_mappings():
    PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
    PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
    PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
    PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel

    try:
        yield
    except:
        PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
        PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
        PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
        PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel
        raise
    finally:
        PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
        PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
        PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
        PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel


def get_gptq_peft_model(
    model: BaseGPTQForCausalLM,
    peft_config: PeftConfig = None,
    model_id: str = None,
    adapter_name: str = "default",
    auto_find_all_linears: bool = True,
    train_mode: bool = False
):
    if train_mode and not model.trainable:
        model.enable_trainable_mode()
    if train_mode and not peft_config:
        raise ValueError("peft_config not specified when in train mode.")
    if not train_mode and not model_id:
        raise ValueError("model_id(where to load adapters) not specified when in inference mode.")

    if model.fused_attn_module_type is not None and not model.injected_fused_attention:
        peft_types = [PeftType.LORA.value, PeftType.ADALORA.value]
        warnings.warn(
            f"You can just ignore this warning if the peft type you use isn't in {peft_types}.\n"
            f"{model.__class__.__name__} supports injecting fused attention but not enables this time. "
            "If you are training adapters, you must also disable fused attention injection when loading quantized "
            "base model at inference time, otherwise adapters may not be added to base model properly. "
            "If you are loading adapters to do inference, you can reference to adapter's config file to check "
            "whether the adapters are trained using base model that not enable fused attention injection."
        )
    if model.injected_fused_mlp:
        raise NotImplementedError("GPTQ model that enables fused mlp injection is not supported to integrate with peft.")

    if train_mode:
        peft_type = peft_config.peft_type
        if not isinstance(peft_type, str):
            peft_type = peft_type.value
        if peft_type in [PeftType.LORA.value, PeftType.ADALORA.value]:
            if auto_find_all_linears:
                peft_config.target_modules = find_all_linear_names(model, ignore_lm_head=True)
            if peft_type == PeftType.LORA.value and not isinstance(peft_config, GPTQLoraConfig):
                peft_config = GPTQLoraConfig(**peft_config.to_dict())
            if peft_type == PeftType.ADALORA.value and not isinstance(peft_config, GPTQAdaLoraConfig):
                peft_config = GPTQAdaLoraConfig(**peft_config.to_dict())
            peft_config.injected_fused_attention = model.injected_fused_attention
            peft_config.injected_fused_mlp = model.injected_fused_mlp
        if peft_type == PeftType.ADAPTION_PROMPT.value:
            if peft_config.adapter_layers > model.config.num_hidden_layers:
                warnings.warn(
                    f"model has only {model.config.num_hidden_layers} layers "
                    f"but adapter_layers is set to {peft_config.adapter_layers}, "
                    f"will reset value to {model.config.num_hidden_layers}."
                )
                peft_config.adapter_layers = model.config.num_hidden_layers
            if model.injected_fused_attention:
                raise NotImplementedError(
                    "model with fused attention injected isn't supported to use ADAPTION_PROMPT peft type yet."
                )

    with hijack_peft_mappings():
        try:
            if train_mode:
                peft_model = get_peft_model(model.model, peft_config)
            else:
                peft_model = PeftModel.from_pretrained(model.model, model_id, adapter_name)
        except:
            raise NotImplementedError(
                f"{model.__class__.__name__} not support {peft_config.peft_type.value} peft type yet."
            )

    return peft_model


__all__ = [
    "GPTQLoraConfig",
    "GPTQLoraModel",
    "GPTQAdaLoraConfig",
    "GPTQAdaLoraModel",
    "find_all_linear_names",
    "get_gptq_peft_model"
]
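A minimal training-side sketch of how this module might be used. The checkpoint path and hyperparameters are placeholders, and the from_quantized keyword names (inject_fused_attention, inject_fused_mlp, trainable) follow the peft examples in this branch, so treat them as assumptions rather than a fixed API.

from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.utils.peft_utils import GPTQLoraConfig, get_gptq_peft_model
from peft import TaskType

# load the quantized base model in trainable mode; fused attention/MLP injection is
# kept off, as the warning emitted by get_gptq_peft_model recommends for adapter training
model = AutoGPTQForCausalLM.from_quantized(
    "path/to/quantized-model",
    device="cuda:0",
    use_triton=True,
    inject_fused_attention=False,
    inject_fused_mlp=False,
    trainable=True,
)

peft_config = GPTQLoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM, inference_mode=False,
)

# auto_find_all_linears=True lets find_all_linear_names pick the target modules
peft_model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
peft_model.print_trainable_parameters()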
auto_gptq/utils/perplexity_utils.py (new file, 215 lines)
@@ -0,0 +1,215 @@
import sys
import torch
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


class Perplexity:
    """
    A class for calculating the perplexity of a language model.
    """

    def __init__(self, model, tokenizer, dataset_path='wikitext', dataset_name=None, split='test', text_column='text'):
        """
        Calculate perplexity using the same method as seen in llama.cpp.

        Parameters
        ----------
        model : AutoModelForCausalLM
            The language model for which the perplexity is calculated.
        tokenizer : AutoTokenizer
            The tokenizer corresponding to the model.
        device : str, optional
            The device to run the calculations on. If auto, the device that your model uses
            will be the device used for these calculations. Default is 'auto'.
        dataset_path : str, optional
            The path to the dataset on the Hugging Face dataset hub. Default is 'wikitext'.
        dataset_name : str, optional
            The name of the dataset. Default is None.
        split : str, optional
            The split of the dataset to use. Default is 'test'.
        text_column : str, optional
            The name of the column in the dataset that contains the text data. Default is 'text'.
        """
        self._model = model
        self._tokenizer = tokenizer
        self._dataset_path = dataset_path
        self._dataset_name = dataset_name
        self._split = split
        self._text_column = text_column
        self._text = self._prepare_data()

    def _get_device(self):
        if torch.backends.mps.is_available():
            return 'mps'
        elif torch.cuda.is_available():
            return 'cuda:0'
        else:
            return 'cpu'

    def _prepare_data(self):
        """
        Prepares the dataset by loading and formatting.

        Returns
        -------
        str
            The formatted dataset as a single string.
        """
        if self._dataset_path == 'wikitext':
            self._dataset_name = 'wikitext-2-raw-v1'

        # Load the dataset
        data = load_dataset(self._dataset_path, self._dataset_name, split=self._split)
        # Format the text column of the dataset
        text_list = [' \n' if s == '' else s for s in data[self._text_column]]
        return ''.join(text_list)

    @staticmethod
    def softmax(logits):
        """
        Static method for applying the softmax function.

        Parameters
        ----------
        logits : np.ndarray
            The input to the softmax function.

        Returns
        -------
        np.ndarray
            The output of the softmax function.
        """
        e_x = np.exp(logits - np.max(logits))
        return e_x / e_x.sum(axis=0)

    def calculate_perplexity(self, n_ctx=512, n_batch=512):
        """
        Calculates the perplexity of the language model.

        Parameters
        ----------
        n_ctx : int
            The context size.
        n_batch : int
            The batch size.

        Returns
        -------
        list
            The list of perplexity scores calculated.
        """
        # Tokenize the text
        self._tokenizer.model_max_length = sys.maxsize
        tokens = self._tokenizer(self._text, truncation=False, return_tensors='pt').input_ids.to(self._model.device)

        nll = 0.0  # Negative log likelihood
        count = 0  # Counter for processed tokens
        curr_ppl = 0
        all_perplexity = []

        with tqdm(range(len(tokens[0]) // n_ctx), desc="Perplexity: - ") as progress:
            for i in progress:
                # Process each batch of tokens
                nll, count = self._process_batch(i, n_ctx, n_batch, tokens, nll, count)

                # Calculate and display the current perplexity
                curr_ppl = np.exp(nll / count)
                all_perplexity.append(curr_ppl)
                progress.set_description(f"Perplexity: {curr_ppl:.4f}")

        return all_perplexity

    def _process_batch(self, i, n_ctx, n_batch, tokens, nll, count):
        """
        Processes each batch of tokens.

        Parameters
        ----------
        i : int
            The batch index.
        n_ctx : int
            The context size.
        n_batch : int
            The batch size.
        tokens : torch.Tensor
            The tokenized text.
        nll : float
            The current negative log likelihood.
        count : int
            The current count of processed tokens.

        Returns
        -------
        float
            The updated negative log likelihood.
        int
            The updated count of processed tokens.
        """
        start = i * n_ctx
        end = start + n_ctx

        num_batches = (n_ctx + n_batch - 1) // n_batch

        logits = []

        for j in range(num_batches):
            batch_start = start + j * n_batch
            batch_size = min(end - batch_start, n_batch)

            token_org = tokens[0][batch_start].item()

            if j == 0:
                # Replace the first token with the BOS token
                tokens[0][batch_start] = self._tokenizer.bos_token_id

            # Compute the logits for the current batch of tokens
            batch_logits = self._compute_batch_logits(tokens, batch_start, batch_size)

            tokens[0][batch_start] = token_org

            logits.append(batch_logits)

        # We rely on the fact that attention in the forward pass only looks at previous
        # tokens here, so the logits returned for each token are an accurate representation
        # of what the model would have predicted at that point.
        #
        # Example, we have a context window of 512, we will compute perplexity for each of the
        # last 256 tokens.  Then, we split the input up into context window size chunks to
        # process the entire prompt.

        for j in range(min(512, n_ctx // 2), n_ctx - 1):
            tok_logits = logits[0][0][j].cpu().numpy()
            # Compute the probability of the next token
            prob = self.softmax(tok_logits)[tokens[0][start + j + 1]]

            # Update the negative log likelihood and the count of processed tokens
            nll += -np.log(prob, where=prob>0)
            count += 1

        return nll, count

    def _compute_batch_logits(self, tokens, batch_start, batch_size):
        """
        Computes the logits for a batch of tokens.

        Parameters
        ----------
        tokens : torch.Tensor
            The tokenized text.
        batch_start : int
            The start index of the batch.
        batch_size : int
            The size of the batch.

        Returns
        -------
        torch.Tensor
            The logits for the batch of tokens.
        """
        # Compute the logits without keeping track of gradients
        with torch.no_grad():
            outputs = self._model(tokens[:, batch_start:batch_start+batch_size])
        return outputs.logits.detach()
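For reference, a minimal sketch of scoring a model with this class; the model path is a placeholder, and the import of Perplexity from auto_gptq.utils assumes the one-line __init__ hunk above lives in that package:

from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq.utils import Perplexity

tokenizer = AutoTokenizer.from_pretrained("path/to/model", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("path/to/model", device_map="auto")

# defaults: wikitext-2-raw-v1 test split, 'text' column, llama.cpp-style windowing
ppl = Perplexity(model, tokenizer)
scores = ppl.calculate_perplexity(n_ctx=512, n_batch=512)
print(f"final perplexity: {scores[-1]:.4f}")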
@@ -172,16 +172,16 @@ void vecquant4matmul_faster_old(
 
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
-  m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
-  m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
-  m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
+  m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
 
   m.def("vecquant2matmul_old", &vecquant2matmul_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
   m.def("vecquant3matmul_old", &vecquant3matmul_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
   m.def("vecquant4matmul_old", &vecquant4matmul_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
   m.def("vecquant8matmul_old", &vecquant8matmul_old, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
-  m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
+  m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA), faster version");
   m.def("vecquant3matmul_faster_old", &vecquant3matmul_faster_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA), faster version");
   m.def("vecquant4matmul_faster_old", &vecquant4matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
 }
autogptq_cuda/autogptq_cuda_64.cpp (new file, 187 lines)
@@ -0,0 +1,187 @@
#include <torch/all.h>
#include <torch/python.h>
#include <c10/cuda/CUDAGuard.h>

void vecquant2matmul_cuda(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
);

void vecquant2matmul(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant2matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}

void vecquant3matmul_cuda(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
);

void vecquant3matmul(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant3matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}

void vecquant4matmul_cuda(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
);

void vecquant4matmul(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant4matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}

void vecquant8matmul_cuda(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
);

void vecquant8matmul(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}


// old

void vecquant2matmul_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
);

void vecquant2matmul_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant2matmul_cuda_old(vec, mat, mul, scales, zeros,groupsize);
}

void vecquant3matmul_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
);

void vecquant3matmul_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant3matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}

void vecquant4matmul_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
);

void vecquant4matmul_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant4matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}

void vecquant8matmul_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
);

void vecquant8matmul_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}

void vecquant2matmul_faster_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
);

void vecquant2matmul_faster_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant2matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}

void vecquant3matmul_faster_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
);

void vecquant3matmul_faster_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant3matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}

void vecquant4matmul_faster_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
);

void vecquant4matmul_faster_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant4matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}


PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");

  m.def("vecquant2matmul_old", &vecquant2matmul_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
  m.def("vecquant3matmul_old", &vecquant3matmul_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
  m.def("vecquant4matmul_old", &vecquant4matmul_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
  m.def("vecquant8matmul_old", &vecquant8matmul_old, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
  m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA), faster version");
  m.def("vecquant3matmul_faster_old", &vecquant3matmul_faster_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA), faster version");
  m.def("vecquant4matmul_faster_old", &vecquant4matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
}
@@ -7,29 +7,66 @@
 // atomicAdd for double-precision floating-point numbers on hardware with
 // compute capability < 6.0 from:
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-__device__ double atomicAdd(
-    double* address,
-    double val
-) {
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;
-
-  do {
-    assumed = old;
-    old = atomicCAS(
-      address_as_ull,
-      assumed,
-      __double_as_longlong(val + __longlong_as_double(assumed))
-    );
-
-    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
+// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+// __device__ double atomicAdd(
+//     double* address,
+//     double val
+// ) {
+//   unsigned long long int* address_as_ull = (unsigned long long int*)address;
+//   unsigned long long int old = *address_as_ull, assumed;
+//
+//   do {
+//     assumed = old;
+//     old = atomicCAS(
+//       address_as_ull,
+//       assumed,
+//       __double_as_longlong(val + __longlong_as_double(assumed))
+//     );
+//
+//     // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+//   } while (assumed != old);
+//
+//   return __longlong_as_double(old);
+// }
+// #endif
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
+// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
+__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
+    unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do {
+        assumed = old;
+        unsigned short hsum = reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
+        hsum += val;
+        old = reinterpret_cast<size_t>(address) & 2
+                 ? (old & 0xffff) | (hsum << 16)
+                 : (old & 0xffff0000) | hsum;
+        old = atomicCAS(address_as_ui, assumed, old);
+
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+}
+__device__ __forceinline__ void atomicAdd(__half* address, c10::Half val) {
+    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do {
+        assumed = old;
+        __half_raw hsum;
+        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+        half tmpres = __hadd(hsum, val);
+        hsum = __half_raw(tmpres);
+        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+        old = atomicCAS(address_as_ui, assumed, old);
+    } while (assumed != old);
+}
 #endif
 
 template <typename scalar_t>
 __global__ void VecQuant2MatMulKernel(
     const scalar_t* __restrict__ vec,

@@ -69,7 +106,7 @@ __global__ void VecQuant4MatMulKernel(
     const int* __restrict__ zeros,
     const int* __restrict__ g_idx,
     int batch,
     int vec_height,
     int height,
     int width,
     int zero_width
autogptq_cuda/autogptq_cuda_kernel_64.cu (new file, 1428 lines)
File diff suppressed because it is too large.
@@ -1,4 +1,6 @@
 ## <center>News or Update</center>
+- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
+- 2023-06-05 - (Update) - Integrate with 🤗 peft to use gptq quantized model to train adapters, support LoRA, AdaLoRA, AdaptionPrompt, etc.
 - 2023-05-30 - (Update) - support download/upload quantized model from/to 🤗 Hub.
 - 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefineWeb/RefineWebModel`(falcon) model types.
 - 2023-05-04 - (Update) - Support using faster cuda kernel when `not desc_act or group_size == -1`
@@ -13,9 +13,9 @@ python basic_usage.py
 
 This script also showcases how to download/upload quantized model from/to 🤗 Hub, to enable those features, you can uncomment the commented codes.
 
-To Execute `basic_usage_with_wikitext2.py`, using command like this:
+To Execute `basic_usage_wikitext2.py`, using command like this:
 ```shell
-python basic_usage_with_wikitext2.py
+python basic_usage_wikitext2.py
 ```
 > Note: There is about 0.6 ppl degrade on opt-125m model using AutoGPTQ, compared to GPTQ-for-LLaMa.
 
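For the Hub support mentioned just above, a minimal download-side sketch; the repo id and basename are placeholders borrowed from the perplexity example later in this diff, not a recommendation:

```python
# Minimal sketch: load a GPTQ-quantized checkpoint directly from the 🤗 Hub.
# Repo id and basename are placeholders taken from the perplexity example in this
# same commit; substitute your own quantized repo.
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

repo_id = "TheBloke/open-llama-7b-open-instruct-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoGPTQForCausalLM.from_quantized(
    repo_id,
    model_basename="gptq_model-4bit-128g",
    use_safetensors=True,  # set according to how the checkpoint was saved
    device_map="auto",
)
```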
@@ -66,11 +66,48 @@ Use `--help` flag to see detailed descriptions for more command arguments.
 > Commands in this chapter should be run under `benchmark` folder.
 
 ### Generation Speed
-`generation_speed.py` scripts gives an example of how to benchmark the generations speed of pretrained and quantized models that `auto_gptq` supports, this benchmarks model generation speed in tokens/s metric.
+`generation_speed.py` script gives an example of how to benchmark the generations speed of pretrained and quantized models that `auto_gptq` supports, this benchmarks model generation speed in tokens/s metric.
 
-To eexcute this script, using command like this:
+To execute this script, using command like this:
 ```shell
 CUDA_VISIBLE_DEVICES=0 python generation_speed.py --model_name_pr_path PATH/TO/MODEL/DIR
 ```
 
 Use `--help` flag to see detailed descriptions for more command arguments.
+
+## PEFT
+> Commands in this chapter should be run under `peft` folder.
+
+### Lora
+`peft_lora_clm_instruction_tuning.py` script gives an example of instruction tuning gptq quantized model's lora adapter using tools in `auto_gptq.utils.peft_utils` and `🤗 peft` on alpaca dataset.
+
+To execute this script, using command like this:
+```shell
+CUDA_VISIBLE_DEVICES=0 python peft_lora_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
+```
+
+Use `--help` flag to see detailed descriptions for more command arguments.
+
+### AdaLora
+`peft_adalora_clm_instruction_tuning.py` script gives an example of instruction tuning gptq quantized model's adalora adapter using tools in `auto_gptq.utils.peft_utils` and `🤗 peft` on alpaca dataset.
+
+To execute this script, using command like this:
+```shell
+CUDA_VISIBLE_DEVICES=0 python peft_adalora_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
+```
+
+Use `--help` flag to see detailed descriptions for more command arguments.
+
+
+### AdaptionPrompt
+`peft_adaption_prompt_clm_instruction_tuning.py` script gives an example of instruction tuning gptq quantized model's adaption_prompt adapter(llama-adapter) using tools in `auto_gptq.utils.peft_utils` and `🤗 peft` on alpaca dataset.
+
+To execute this script, using command like this:
+```shell
+CUDA_VISIBLE_DEVICES=0 python peft_adaption_prompt_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
+```
+
+Use `--help` flag to see detailed descriptions for more command arguments.
+
+If you want to try models other than llama, you can install peft from source using [this branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt), see [here](https://github.com/PanQiWei/peft/blob/a5f8f74f07591efe5eb3d08cb1b31b981e84a069/src/peft/tuners/adaption_prompt.py#L235)
+to check what other models are also supported, and with this branch installed, you can also use `ADAPTION_PROMPT_V2` peft type (llama-adapter-v2) by simply replace `AdaptionPromptConfig` with `AdaptionPromptV2Config` in the script.
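For readers who only want the gist of the tokens/s number the section above reports, a rough, self-contained sketch of the measurement (model name and prompt are placeholders; this is not the exact accounting `generation_speed.py` does):

```python
# Rough tokens/s measurement: newly generated tokens divided by wall-clock generation time.
# Placeholder model and prompt; a real benchmark should also warm up and average several runs.
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model.eval()

inputs = tokenizer("The quick brown fox", return_tensors="pt")
start = time.time()
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)
elapsed = time.time() - start

new_tokens = output_ids.shape[1] - inputs["input_ids"].shape[1]
print(f"{new_tokens / elapsed:.2f} tokens/s")
```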
File diff suppressed because it is too large.
@@ -144,7 +144,9 @@ def load_model_tokenizer(
     trust_remote_code: bool = False,
     use_triton: bool = False,
     use_safetensors: bool = False,
-    use_fast_tokenizer: bool = False
+    use_fast_tokenizer: bool = False,
+    inject_fused_attention: bool = True,
+    inject_fused_mlp: bool = True
 ):
     tokenizer = AutoTokenizer.from_pretrained(
         pretrained_model_name_or_path=tokenizer_name_or_path or model_name_or_path,

@@ -163,12 +165,12 @@ def load_model_tokenizer(
         )
     else:
         model = AutoGPTQForCausalLM.from_quantized(
-            save_dir=model_name_or_path,
+            model_name_or_path,
             max_memory=max_memory,
             low_cpu_mem_usage=True,
             use_triton=use_triton,
-            inject_fused_attention=True,
-            inject_fused_mlp=True,
+            inject_fused_attention=inject_fused_attention,
+            inject_fused_mlp=inject_fused_mlp,
             use_cuda_fp16=True,
             quantize_config=quantize_config,
             model_basename=model_basename,

@@ -232,6 +234,8 @@ def main():
     parser.add_argument("--use_triton", action="store_true")
     parser.add_argument("--use_safetensors", action="store_true")
     parser.add_argument("--use_fast_tokenizer", action="store_true")
+    parser.add_argument("--no_inject_fused_attention", action="store_true")
+    parser.add_argument("--no_inject_fused_mlp", action="store_true")
     parser.add_argument("--num_samples", type=int, default=10)
     parser.add_argument("--per_gpu_max_memory", type=int, default=None)
     parser.add_argument("--cpu_max_memory", type=int, default=None)

@@ -269,7 +273,9 @@ def main():
         trust_remote_code=args.trust_remote_code,
         use_triton=args.use_triton,
         use_safetensors=args.use_safetensors,
-        use_fast_tokenizer=args.use_fast_tokenizer
+        use_fast_tokenizer=args.use_fast_tokenizer,
+        inject_fused_attention=not args.no_inject_fused_attention,
+        inject_fused_mlp=not args.no_inject_fused_mlp
     )
     end = time.time()
     logger.info(f"model and tokenizer loading time: {end - start:.4f}s")

@@ -282,7 +288,9 @@ def main():
         model.warmup_triton()
 
     logger.info("loading data")
-    examples = load_data("dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples, args.max_new_tokens)
+    examples = load_data(
+        "../quantization/dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples, args.max_new_tokens
+    )
 
     generation_config = GenerationConfig(
         num_beams=args.num_beams,
examples/benchmark/perplexity.py (new file, 86 lines)
@@ -0,0 +1,86 @@
import os
import argparse

import torch
from auto_gptq.utils import Perplexity
from transformers import AutoTokenizer

if __name__ == "__main__":
    """
    Example usage.

    Default usage with GPT2 model:
    python examples/benchmark/perplexity.py

    Specify GPTQ quantized model:
    python examples/benchmark/perplexity.py \
        --model_name TheBloke/open-llama-7b-open-instruct-GPTQ \
        --model_basename gptq_model-4bit-128g \
        --is_quantized

    Change your dataset:
    python examples/benchmark/perplexity.py --dataset_path tiny_shakespeare

    """
    parser = argparse.ArgumentParser(description="Calculate Perplexity for a model.")
    parser.add_argument("--model_name", type=str, default='gpt2', help="Model name.")
    parser.add_argument("--model_basename", type=str, default=None, help="Model file's basename.")
    parser.add_argument("--n_ctx", type=int, default=512, help="Context size.")
    parser.add_argument("--n_batch", type=int, default=512, help="Batch size.")
    parser.add_argument("--dataset_path", type=str, default='wikitext', help="Path to the dataset.")
    parser.add_argument("--dataset_name", type=str, default=None, help="Name of the dataset.")
    parser.add_argument("--split", type=str, default='test', help="Dataset split to use.")
    parser.add_argument("--text_column", type=str, default='text', help="Column in the dataset containing the text.")
    parser.add_argument("--per_gpu_max_memory", type=int, default=None, help="Max memory used in each GPU.")
    parser.add_argument("--cpu_max_memory", type=int, default=None, help="Max memory used in CPU.")
    parser.add_argument("--is_quantized", action="store_true", help="Is the model GPTQ quantized?")
    parser.add_argument("--use_safetensors", action="store_true", help="Whether to use safetensors model file")
    parser.add_argument("--use_fast_tokenizer", action="store_true", help="Whether to use fast tokenizer")
    parser.add_argument("--trust_remote_code", action="store_true", help="Whether to use remote code")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=args.use_fast_tokenizer)
    if not tokenizer.pad_token_id:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    max_memory = dict()
    if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
        if torch.cuda.is_available():
            max_memory.update(
                {i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())}
            )
    if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
        max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
    if not max_memory:
        max_memory = None

    if args.is_quantized:
        from auto_gptq import AutoGPTQForCausalLM

        model = AutoGPTQForCausalLM.from_quantized(
            args.model_name,
            low_cpu_mem_usage=True,
            device_map="auto",
            max_memory=max_memory,
            model_basename=args.model_basename,
            use_safetensors=args.use_safetensors,
            trust_remote_code=args.trust_remote_code,
            inject_fused_mlp=False,
            inject_fused_attention=False
        )
    else:
        from transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.from_pretrained(
            args.model_name,
            low_cpu_mem_usage=True,
            device_map="auto",
            max_memory=max_memory,
            torch_dtype=torch.float16,
            trust_remote_code=args.trust_remote_code
        )

    ppl = Perplexity(model, tokenizer, args.dataset_path, args.dataset_name, args.split, args.text_column)
    ppl.calculate_perplexity(args.n_ctx, args.n_batch)
examples/peft/peft_adalora_clm_instruction_tuning.py (new file, 169 lines)
@@ -0,0 +1,169 @@
import json
import os
from argparse import ArgumentParser
from functools import partial

import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup

from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from auto_gptq.utils.peft_utils import GPTQAdaLoraConfig
from peft import TaskType

parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--lr", type=float, default=3e-3)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path

lr = args.lr
num_epochs = args.num_epochs

# creating model
peft_config = GPTQAdaLoraConfig(
    init_r=20,
    target_r=16,
    beta1=0.85,
    beta2=0.85,
    tinit=200,
    tfinal=1000,
    deltaT=10,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    use_triton=True,
    warmup_triton=False,
    trainable=True,
    inject_fused_attention=True,
    inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()

# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"


def ds_refactor_fn(samples):
    instruction_data = samples["instruction"]
    input_data = samples["input"]
    output_data = samples["output"]

    new_samples = {"prompt": [], "output": []}
    for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
        if input_txt:
            prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
        else:
            prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
        new_samples["prompt"].append(prompt)
        new_samples["output"].append(output_txt)

    return new_samples


ds = Dataset.from_generator(
    lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
    make_data_block,
    batched=True,
    batch_size=len(ds),
    num_proc=1,
    remove_columns=ds.column_names,
    keep_in_memory=True,
    load_from_cache_file=False,
    fn_kwargs={
        "prompt_col_name": "prompt",
        "label_col_name": "output",
        "tokenizer": tokenizer,
        "preprocess_fn": ds_refactor_fn,
        "sample_max_len": args.sample_max_length,
        "block_max_len": args.block_max_length,
        "add_eos_token": True,
        "truncate_prompt": False,
        "merge_prompt_label": True
    }
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)

# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)
model.base_model.peft_config["default"].total_step = len(train_dataloader) * num_epochs

# training and evaluation
with torch.cuda.amp.autocast():
    global_step = 0
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader)
        for step, batch in enumerate(progress_bar):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # Update the importance of low-rank matrices
            # and allocate the budget accordingly.
            model.base_model.update_and_allocate(global_step)
            optimizer.zero_grad()
            global_step += 1

            progress_bar.set_postfix(loss=loss.item())

        model.eval()
        eval_loss = 0
        eval_preds = []
        for step, batch in enumerate(tqdm(eval_dataloader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            eval_preds.extend(
                tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
            )

        eval_epoch_loss = eval_loss / len(eval_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))
examples/peft/peft_adaption_prompt_clm_instruction_tuning.py (new file, 158 lines)
@@ -0,0 +1,158 @@
import json
import os
from argparse import ArgumentParser
from functools import partial

import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup

from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from peft import TaskType, AdaptionPromptConfig

parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--adapter_len", type=int, default=10)
parser.add_argument("--adapter_layers", type=int, default=30)
parser.add_argument("--lr", type=float, default=3e-3)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path

lr = args.lr
num_epochs = args.num_epochs

# creating model
peft_config = AdaptionPromptConfig(
    adapter_len=args.adapter_len,
    adapter_layers=args.adapter_layers,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    use_triton=True,
    warmup_triton=False,
    trainable=True,
    inject_fused_attention=False,
    inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()

# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"


def ds_refactor_fn(samples):
    instruction_data = samples["instruction"]
    input_data = samples["input"]
    output_data = samples["output"]

    new_samples = {"prompt": [], "output": []}
    for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
        if input_txt:
            prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
        else:
            prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
        new_samples["prompt"].append(prompt)
        new_samples["output"].append(output_txt)

    return new_samples


ds = Dataset.from_generator(
    lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
    make_data_block,
    batched=True,
    batch_size=len(ds),
    num_proc=1,
    remove_columns=ds.column_names,
    keep_in_memory=True,
    load_from_cache_file=False,
    fn_kwargs={
        "prompt_col_name": "prompt",
        "label_col_name": "output",
        "tokenizer": tokenizer,
        "preprocess_fn": ds_refactor_fn,
        "sample_max_len": args.sample_max_length,
        "block_max_len": args.block_max_length,
        "add_eos_token": True,
        "truncate_prompt": False,
        "merge_prompt_label": True
    }
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)

# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

# training and evaluation
with torch.cuda.amp.autocast():
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader)
        for step, batch in enumerate(progress_bar):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            optimizer.zero_grad()

            progress_bar.set_postfix(loss=loss.item())

        model.eval()
        eval_loss = 0
        eval_preds = []
        for step, batch in enumerate(tqdm(eval_dataloader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            eval_preds.extend(
                tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
            )

        eval_epoch_loss = eval_loss / len(eval_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))
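As a reminder of the README note earlier in this diff: llama-adapter-v2 is only available from the linked peft branch, and that note implies the v2 config is a drop-in swap. Under that assumption (the class and its fields are not part of mainline peft or of this commit), the change inside the script above would look roughly like:

```python
# Hypothetical drop-in swap described in the README hunk above. AdaptionPromptV2Config
# comes from the multi_modal_adaption_prompt branch of peft linked there, and we assume
# it accepts the same fields as AdaptionPromptConfig; `args` is the script's own namespace.
from peft import TaskType, AdaptionPromptV2Config

peft_config = AdaptionPromptV2Config(
    adapter_len=args.adapter_len,
    adapter_layers=args.adapter_layers,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)
```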
examples/peft/peft_lora_clm_instruction_tuning.py (new file, 158 lines)
@@ -0,0 +1,158 @@
import json
import os
from argparse import ArgumentParser
from functools import partial

import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup

from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from auto_gptq.utils.peft_utils import GPTQLoraConfig
from peft import TaskType

parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--lr", type=float, default=3e-5)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path

lr = args.lr
num_epochs = args.num_epochs

# creating model
peft_config = GPTQLoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    use_triton=True,
    warmup_triton=False,
    trainable=True,
    inject_fused_attention=True,
    inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()

# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"


def ds_refactor_fn(samples):
    instruction_data = samples["instruction"]
    input_data = samples["input"]
    output_data = samples["output"]

    new_samples = {"prompt": [], "output": []}
    for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
        if input_txt:
            prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
        else:
            prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
        new_samples["prompt"].append(prompt)
        new_samples["output"].append(output_txt)

    return new_samples


ds = Dataset.from_generator(
    lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
    make_data_block,
    batched=True,
    batch_size=len(ds),
    num_proc=1,
    remove_columns=ds.column_names,
    keep_in_memory=True,
    load_from_cache_file=False,
    fn_kwargs={
        "prompt_col_name": "prompt",
        "label_col_name": "output",
        "tokenizer": tokenizer,
        "preprocess_fn": ds_refactor_fn,
        "sample_max_len": args.sample_max_length,
        "block_max_len": args.block_max_length,
        "add_eos_token": True,
        "truncate_prompt": False,
        "merge_prompt_label": True
    }
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)

# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

# training and evaluation
with torch.cuda.amp.autocast():
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader)
        for step, batch in enumerate(progress_bar):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            optimizer.zero_grad()

            progress_bar.set_postfix(loss=loss.item())

        model.eval()
        eval_loss = 0
        eval_preds = []
        for step, batch in enumerate(tqdm(eval_dataloader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            eval_preds.extend(
                tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
            )

        eval_epoch_loss = eval_loss / len(eval_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))
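A small follow-up the three training scripts stop short of: once training finishes, the adapter-wrapped model can be smoke-tested with ordinary generation. This is a sketch under the assumption that the peft-wrapped AutoGPTQ model exposes the standard `generate()` interface; it reuses `tokenizer`, `model`, `device` and the prompt template from the script above and is not part of this commit.

```python
# Quick post-training smoke test: greedy-generate from the tuned adapter.
# Assumes the usual transformers/peft generate() interface; not part of this commit.
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction="Name three primary colors.")
inputs = tokenizer(prompt, return_tensors="pt").to(device)

model.eval()
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```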
setup.py (108 changed lines)
@@ -4,29 +4,30 @@ import sys
 from pathlib import Path
 from setuptools import setup, find_packages
 
-try:
-    import torch
-    TORCH_AVAILABLE = True
-except ImportError:
-    TORCH_AVAILABLE = False
-
-IN_GITHUB_ACTIONS = os.environ.get("GITHUB_ACTIONS", "false") == "true"
-
 python_min_version = (3, 8, 0)
 python_min_version_str = '.'.join(map(str, python_min_version))
 if sys.version_info < python_min_version:
     print(f"You are using Python {platform.python_version()}. Python >={python_min_version_str} is required.")
     sys.exit(-1)
 
-CUDA_VERSION = "".join(os.environ.get("CUDA_VERSION", "").split("."))
+BUILD_CUDA_EXT = int(os.environ.get('BUILD_CUDA_EXT', '1')) == 1
+
+if BUILD_CUDA_EXT:
+    try:
+        import torch
+    except:
+        print("torch is not installed, please install torch first!")
+        sys.exit(-1)
+    CUDA_VERSION = "".join(torch.version.cuda.split("."))
+else:
+    CUDA_VERSION = "".join(os.environ.get("CUDA_VERSION", "").split("."))
 
-version = "0.2.1" + (f"+cu{CUDA_VERSION}" if CUDA_VERSION and IN_GITHUB_ACTIONS else "")
 common_setup_kwargs = {
-    "version": version,
+    "version": "0.3.2",
     "name": "auto_gptq",
     "author": "PanQiWei",
     "description": "An easy-to-use LLMs quantization package with user-friendly apis, based on GPTQ algorithm.",
-    "long_description": (Path(__file__).parent / "README.md").read_text(),
+    "long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"),
     "long_description_content_type": "text/markdown",
     "url": "https://github.com/PanQiWei/AutoGPTQ",
     "keywords": ["gptq", "quantization", "large-language-models", "pytorch", "transformers"],

@@ -45,6 +46,9 @@ common_setup_kwargs = {
     "python_requires": f">={python_min_version_str}"
 }
 
+if CUDA_VERSION:
+    common_setup_kwargs['version'] += f"+cu{CUDA_VERSION}"
+
 requirements = [
     "accelerate>=0.19.0",
     "datasets",

@@ -52,54 +56,50 @@ requirements = [
     "rouge",
     "torch>=1.13.0",
     "safetensors",
-    "transformers>=4.26.1"
+    "transformers>=4.31.0",
+    "peft"
 ]
 
 extras_require = {
-    "llama": ["transformers>=4.28.0"],
     "triton": ["triton>=2.0.0"]
 }
 
 include_dirs = ["autogptq_cuda"]
 
-if TORCH_AVAILABLE:
-    BUILD_CUDA_EXT = int(os.environ.get('BUILD_CUDA_EXT', '1')) == 1
-
-    additional_setup_kwargs = dict()
-    if BUILD_CUDA_EXT and (torch.cuda.is_available() or IN_GITHUB_ACTIONS):
-        from torch.utils import cpp_extension
-        from distutils.sysconfig import get_python_lib
-        conda_cuda_include_dir=os.path.join(get_python_lib(),"nvidia/cuda_runtime/include")
-        if os.path.isdir(conda_cuda_include_dir):
-            include_dirs.append(conda_cuda_include_dir)
-            print(f"appending conda cuda include dir {conda_cuda_include_dir}")
-        extensions = [
-            cpp_extension.CUDAExtension(
-                "autogptq_cuda",
-                [
-                    "autogptq_cuda/autogptq_cuda.cpp",
-                    "autogptq_cuda/autogptq_cuda_kernel.cu"
-                ]
-            )
-        ]
-
-        additional_setup_kwargs = {
-            "ext_modules": extensions,
-            "cmdclass": {'build_ext': cpp_extension.BuildExtension}
-        }
-    common_setup_kwargs.update(additional_setup_kwargs)
-    setup(
-        packages=find_packages(),
-        install_requires=requirements,
-        extras_require=extras_require,
-        include_dirs=include_dirs,
-        **common_setup_kwargs
-    )
-else:
-    setup(
-        packages=find_packages(),
-        install_requires=requirements,
-        extras_require=extras_require,
-        include_dirs=include_dirs,
-        **common_setup_kwargs
-    )
+additional_setup_kwargs = dict()
+if BUILD_CUDA_EXT:
+    from torch.utils import cpp_extension
+    from distutils.sysconfig import get_python_lib
+    conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
+    if os.path.isdir(conda_cuda_include_dir):
+        include_dirs.append(conda_cuda_include_dir)
+        print(f"appending conda cuda include dir {conda_cuda_include_dir}")
+    extensions = [
+        cpp_extension.CUDAExtension(
+            "autogptq_cuda_64",
+            [
+                "autogptq_cuda/autogptq_cuda_64.cpp",
+                "autogptq_cuda/autogptq_cuda_kernel_64.cu"
+            ]
+        ),
+        cpp_extension.CUDAExtension(
+            "autogptq_cuda_256",
+            [
+                "autogptq_cuda/autogptq_cuda_256.cpp",
+                "autogptq_cuda/autogptq_cuda_kernel_256.cu"
+            ]
+        )
+    ]
+
+    additional_setup_kwargs = {
+        "ext_modules": extensions,
+        "cmdclass": {'build_ext': cpp_extension.BuildExtension}
+    }
+common_setup_kwargs.update(additional_setup_kwargs)
+setup(
+    packages=find_packages(),
+    install_requires=requirements,
+    extras_require=extras_require,
+    include_dirs=include_dirs,
+    **common_setup_kwargs
+)
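One practical consequence of the setup.py change above, with the caveat that it depends on how the build is invoked: `BUILD_CUDA_EXT` is now read from the environment at build time, so something like `BUILD_CUDA_EXT=0 pip install .` should skip compiling the CUDA extensions entirely, while the default (`1`) requires torch to be importable so that `torch.version.cuda` can supply the `+cuXXX` suffix appended to the package version.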