Merge branch 'main' into MPT

# Conflicts:
#	auto_gptq/modeling/__init__.py
#	auto_gptq/modeling/_const.py
#	auto_gptq/modeling/auto.py
LaaZa 2023-07-26 20:41:19 +03:00
commit 6ff6bc8dfc
37 changed files with 3503 additions and 258905 deletions

View file

@ -1,4 +1,4 @@
name: Build AutoGPTQ Wheels
name: Build AutoGPTQ Wheels with CUDA
on: workflow_dispatch
@ -51,7 +51,7 @@ jobs:
if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH}
$env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6+PTX'
if ([decimal]$env:CUDA_VERSION -ge 11.8) { $env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
python -m build -n
python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
if: runner.os == 'Linux'
@ -64,37 +64,3 @@ jobs:
with:
name: 'windows-wheels'
path: ./dist/*.whl
build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
defaults:
run:
shell: pwsh
steps:
- uses: actions/checkout@v3
with:
ref: 'main'
- uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Install Dependencies
run: |
python -m pip install --upgrade build setuptools wheel
- name: Build Wheel
run: |
python -m build -n
- uses: actions/upload-artifact@v3
with:
name: 'sdist'
path: ./dist/*.tar.gz
- uses: actions/upload-artifact@v3
with:
name: 'no-cuda-wheel'
path: ./dist/*.whl

View file

@ -12,14 +12,15 @@
<p>
<b>English</b> |
<a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README_zh.md">中文</a>
<p>
</p>
</h4>
*<center>📣 Long time no see! 👋 Architecture upgrades, performance optimizations and more new features are coming in July and August, stay tuned! 🥂</center>*
## News or Update
**To get an early preview of adapter training with `auto_gptq` quantized models, you can try [this branch](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration) and join the discussion [here](https://github.com/PanQiWei/AutoGPTQ/issues/103); examples are [here](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration/examples/peft).**
- 2023-05-25 - (In Progress) - Integrate with 🤗 peft to use gptq quantized models to train adapters; supports LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
- 2023-06-05 - (Update) - Integrate with 🤗 peft to use gptq quantized models to train adapters; supports LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-05-30 - (Update) - Support downloading/uploading quantized models from/to the 🤗 Hub.
- 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefinedWeb/RefinedWebModel` (falcon) model types.
- 2023-05-04 - (Update) - Support using faster cuda kernels when `not desc_act or group_size == -1`.
@ -69,11 +70,7 @@ And to make sure `autogptq_cuda` is not ever in your virtual environment, run:
```shell
pip uninstall autogptq_cuda -y
```
#### to support LLaMa model
For those who want to try LLaMa but whose `transformers` version does not yet support it, use:
```shell
pip install auto-gptq[llama]
```
#### to support triton speedup
To integrate with `triton`, use:
> warning: currently triton only supports linux; 3-bit quantization is not supported when using triton
@ -96,8 +93,6 @@ pip install .
```
Like quick installation, you can also set `BUILD_CUDA_EXT=0` to disable pytorch extension building.
Use `.[llama]` if you want to try the LLaMa model.
Use `.[triton]` if you want to integrate with triton and it's available on your operating system.
</details>
@ -304,18 +299,18 @@ print(
>
> for example, the model_type of `WizardLM`, `vicuna` and `gpt4all` is `llama`, hence they are all supported by `auto_gptq`.
| model type | quantization | inference | peft-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|----------------------|
| bloom | ✅ | ✅ | | |
| gpt2 | ✅ | ✅ | | |
| gpt_neox | ✅ | ✅ | | |
| gptj | ✅ | ✅ | | |
| llama | ✅ | ✅ | | ✅ |
| moss | ✅ | ✅ | | |
| opt | ✅ | ✅ | | |
| gpt_bigcode | ✅ | ✅ | | |
| codegen | ✅ | ✅ | | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | | |
| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|---------------|-------------------------------------------------------------------------------------------------|
| bloom | ✅ | ✅ | ✅ | ✅ | |
| gpt2 | ✅ | ✅ | ✅ | ✅ | |
| gpt_neox | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| gptj | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| llama | ✅ | ✅ | ✅ | ✅ | ✅ |
| moss | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| opt | ✅ | ✅ | ✅ | ✅ | |
| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
| codegen | ✅ | ✅ | ✅ | ✅ | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
## Supported Evaluation Tasks
Currently, `auto_gptq` supports: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more Tasks will come soon!
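The table above pairs naturally with a concrete workflow. Below is a minimal, hedged sketch of quantizing and reloading a model with the APIs touched in this commit; the model name, calibration text and output directory are placeholders, and the `examples` format follows the project's existing usage examples rather than anything introduced here.
```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

pretrained_model_name = "facebook/opt-125m"  # placeholder: any causal LM of a supported model type
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
# a single calibration example; real quantization should use many more
examples = [tokenizer("auto-gptq is an easy-to-use model quantization library.", return_tensors="pt")]

quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_name, quantize_config)
model.quantize(examples)
model.save_quantized("opt-125m-4bit", use_safetensors=True)

# reload the quantized weights for inference
model = AutoGPTQForCausalLM.from_quantized("opt-125m-4bit", device="cuda:0", use_safetensors=True)
inputs = tokenizer("auto-gptq is", return_tensors="pt").to("cuda:0")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```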

View file

@ -12,14 +12,15 @@
<p>
<a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README.md">English</a> |
<b>中文</b>
<p>
</p>
</h4>
*<center>📣 Long time no see! 👋 Architecture upgrades, performance optimizations and new features are coming in July and August, stay tuned! 🥂</center>*
## News or Update
**To get an early preview of adapter training with `auto_gptq` quantized models, you can try [this branch](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration) and join the discussion [here](https://github.com/PanQiWei/AutoGPTQ/issues/103); example scripts are provided [here](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration/examples/peft).**
- 2023-05-25 - (In Progress) - Integrate with 🤗 peft to train adapters on gptq quantized models; supports LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
- 2023-06-05 - (Update) - Integrate with 🤗 peft to train adapters on gptq quantized models; supports LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-05-30 - (Update) - Support downloading quantized models from, and uploading quantized models to, the 🤗 Hub.
- 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefinedWeb/RefinedWebModel` (falcon) model types.
- 2023-05-04 - (Update) - Support using faster cuda kernels when `not desc_act or group_size == -1`.
@ -69,11 +70,7 @@ BUILD_CUDA_EXT=0 pip install auto-gptq
```shell
pip uninstall autogptq_cuda -y
```
#### to support LLaMa model
For those who want to try LLaMa but whose `transformers` version does not yet support it, use:
```shell
pip install auto-gptq[llama]
```
#### to support triton speedup
To use `triton` to speed up model inference, use:
> warning: currently triton only supports linux; 3-bit quantization is not supported when using triton
@ -96,8 +93,6 @@ pip install .
```
As in the quick installation section, you can also set `BUILD_CUDA_EXT=0` to disable building the cuda extension.
Use `.[llama]` if you want to try the LLaMa model.
Use `.[triton]` if you want to integrate with triton and it's available on your operating system.
</details>
@ -303,18 +298,18 @@ print(
>
> for example, the model_type of `WizardLM`, `vicuna` and `gpt4all` is `llama`, hence they are all supported by `auto_gptq`.
| model type | quantization | inference | peft-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|----------------------|
| bloom | ✅ | ✅ | | |
| gpt2 | ✅ | ✅ | | |
| gpt_neox | ✅ | ✅ | | |
| gptj | ✅ | ✅ | | |
| llama | ✅ | ✅ | | ✅ |
| moss | ✅ | ✅ | | |
| opt | ✅ | ✅ | | |
| gpt_bigcode | ✅ | ✅ | | |
| codegen | ✅ | ✅ | | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | | |
| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|---------------|-----------------------------------------------------------------------------------|
| bloom | ✅ | ✅ | ✅ | ✅ | |
| gpt2 | ✅ | ✅ | ✅ | ✅ | |
| gpt_neox                           | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| gptj                               | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| llama                              | ✅            | ✅         | ✅         | ✅             | ✅                                                                                    |
| moss                               | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| opt | ✅ | ✅ | ✅ | ✅ | |
| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
| codegen | ✅ | ✅ | ✅ | ✅ | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
## Supported Evaluation Tasks
Currently, `auto_gptq` supports: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more Tasks will come soon!

View file

@ -1,2 +1,4 @@
__version__ = "0.3.2"
from .modeling import BaseQuantizeConfig
from .modeling import AutoGPTQForCausalLM
from .utils.peft_utils import get_gptq_peft_model
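The package now exports `AutoGPTQForCausalLM` and `get_gptq_peft_model` at the top level. A hedged sketch of how these exports fit together for adapter training follows; the repo id and LoRA hyper-parameters are placeholders, not values prescribed by this commit.
```python
from peft import LoraConfig, TaskType
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model

model = AutoGPTQForCausalLM.from_quantized(
    "someuser/some-gptq-model",    # placeholder: any already-quantized checkpoint
    use_triton=True,               # trainable mode currently requires the triton backend
    trainable=True,
    inject_fused_attention=False,  # fused attention/mlp injection is typically disabled when training adapters
    inject_fused_mlp=False,
)
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=32, lora_dropout=0.05)
model = get_gptq_peft_model(model, peft_config=peft_config, train_mode=True, auto_find_all_linears=True)
model.print_trainable_parameters()
```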

View file

@ -10,4 +10,6 @@ from .opt import *
from .rw import *
from .gpt_bigcode import *
from .codegen import *
from .baichuan import *
from .internlm import *
from .mpt import *

View file

@ -20,10 +20,11 @@ from transformers.modeling_utils import no_init_weights
from ._const import *
from ._utils import *
from ..nn_modules.qlinear import GeneralQuantLinear
from ..nn_modules._fused_base import FusedBaseAttentionModule, FusedBaseMLPModule
from ..quantization import GPTQ
from ..utils.data_utils import collate_data
from ..utils.import_utils import TRITON_AVAILABLE
from ..utils.import_utils import dynamically_import_QuantLinear, TRITON_AVAILABLE, AUTOGPTQ_CUDA_AVAILABLE
logger = getLogger(__name__)
@ -112,7 +113,16 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
fused_attn_module_type: Optional[FusedBaseAttentionModule] = None
fused_mlp_module_type: Optional[FusedBaseMLPModule] = None
def __init__(self, model: PreTrainedModel, quantized: bool, quantize_config: BaseQuantizeConfig):
def __init__(
self,
model: PreTrainedModel,
quantized: bool,
quantize_config: BaseQuantizeConfig,
is_triton_backend: bool = False,
injected_fused_attention: bool = False,
injected_fused_mlp: bool = False,
trainable: bool = False
):
super().__init__()
self.model = model
@ -121,6 +131,11 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
self.quantize_config = quantize_config
self.config = self.model.config
self.is_triton_backend = is_triton_backend
self.injected_fused_attention = injected_fused_attention
self.injected_fused_mlp = injected_fused_mlp
self.trainable = trainable
@property
def quantized(self):
return self._quantized
@ -431,6 +446,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
repo_id: str,
save_dir: Optional[str] = None,
use_safetensors: Optional[bool] = True,
safetensors_metadata: Optional[Dict[str, str]] = None,
commit_message: Optional[str] = "Upload of AutoGPTQ quantized model",
use_auth_token: Optional[Union[bool, str]] = None,
private: Optional[bool] = None,
@ -450,6 +466,10 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
use_safetensors (`bool`, *optional*):
Save the model using `safetensors`.
If the model has already been saved, this parameter can be omitted.
safetensors_metadata: (`dict`, *optional*, defaults to `None`):
Pass optional metadata dictionary to be saved in the `safetensors` model file(s).
Metadata is optional and is purely for informational purposes. It does not affect inference.
If `None`, no metadata will be saved.
commit_message (`str`, *optional*, defaults to `"Upload tool"`):
Message to commit while pushing.
use_auth_token (`bool` or `str`, *optional*):
@ -469,7 +489,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
if save_dir is not None:
logger.info(f"Saving model to {save_dir}")
self.save_quantized(save_dir, use_safetensors)
self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
repo_url = create_repo(
repo_id=repo_id, token=token, private=private, exist_ok=True, repo_type="model"
@ -492,7 +512,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
repo_type="model",
)
def save_quantized(self, save_dir: str, use_safetensors: bool = False):
def save_quantized(self, save_dir: str, use_safetensors: bool = False, safetensors_metadata: Optional[Dict[str, str]] = None):
"""save quantized model and configs to local disk"""
os.makedirs(save_dir, exist_ok=True)
@ -506,7 +526,42 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
model_save_name = model_base_name + ".safetensors"
state_dict = self.model.state_dict()
state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
safe_save(state_dict, join(save_dir, model_save_name))
if safetensors_metadata is None:
safetensors_metadata = {}
elif not isinstance(safetensors_metadata, dict):
raise TypeError("safetensors_metadata must be a dictionary.")
else:
logger.debug(f"Received safetensors_metadata: {safetensors_metadata}")
new_safetensors_metadata = {}
converted_keys = False
for key, value in safetensors_metadata.items():
if not isinstance(key, str) or not isinstance(value, str):
converted_keys = True
try:
new_key = str(key)
new_value = str(value)
except Exception as e:
raise TypeError(f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}")
if new_key in new_safetensors_metadata:
logger.warning(f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.")
new_safetensors_metadata[new_key] = new_value
safetensors_metadata = new_safetensors_metadata
if converted_keys:
logger.debug(f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}")
# Format is required to enable Accelerate to load the metadata
# otherwise it raises an OSError
safetensors_metadata['format'] = "pt"
# Store the quantization configuration as safetensors metadata
from auto_gptq import __version__
safetensors_metadata['auto_gptq_version'] = str(__version__)
safetensors_metadata['gptq_bits'] = str(self.quantize_config.bits)
safetensors_metadata['gptq_group_size'] = str(self.quantize_config.group_size)
safetensors_metadata['gptq_desc_act'] = str(self.quantize_config.desc_act)
safetensors_metadata['gptq_damp_percent'] = str(self.quantize_config.damp_percent)
safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata)
else:
model_save_name = model_base_name + ".bin"
torch.save(self.model.state_dict(), join(save_dir, model_save_name))
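A short sketch of the new `safetensors_metadata` parameter, assuming `model` is an already-quantized model instance. The file name below follows the project's default `gptq_model-{bits}bit-{group_size}g` naming and should be adjusted to whatever `save_quantized` actually wrote.
```python
from safetensors import safe_open

# keys and values must be strings; the format=pt, auto_gptq_version and gptq_* entries are added automatically
model.save_quantized(
    "opt-125m-4bit",
    use_safetensors=True,
    safetensors_metadata={"quantized_by": "example-user", "calibration_dataset": "c4"},
)

# read the metadata back from the safetensors header
with safe_open("opt-125m-4bit/gptq_model-4bit-128g.safetensors", framework="pt") as f:  # assumed default file name
    print(f.metadata())
```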
@ -516,10 +571,10 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
self.quantize_config.model_name_or_path = save_dir
self.quantize_config.model_file_base_name = model_base_name
def save_pretrained(self, save_dir: str, use_safetensors: bool = False, **kwargs):
def save_pretrained(self, save_dir: str, use_safetensors: bool = False, safetensors_metadata: Optional[Dict[str, str]] = None, **kwargs):
"""alias of save_quantized"""
logger.warning("you are using save_pretrained, which will re-direct to save_quantized.")
self.save_quantized(save_dir, use_safetensors)
self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
@classmethod
def from_pretrained(
@ -543,7 +598,29 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
# Parameters related to loading from Hugging Face Hub
cache_dir = model_init_kwargs.pop("cache_dir", None)
force_download = model_init_kwargs.pop("force_download", False)
resume_download = model_init_kwargs.pop("resume_download", False)
proxies = model_init_kwargs.pop("proxies", None)
local_files_only = model_init_kwargs.pop("local_files_only", False)
use_auth_token = model_init_kwargs.pop("use_auth_token", None)
revision = model_init_kwargs.pop("revision", None)
subfolder = model_init_kwargs.pop("subfolder", "")
commit_hash = model_init_kwargs.pop("_commit_hash", None)
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
}
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True, **cached_file_kwargs)
if config.model_type not in SUPPORTED_MODELS:
raise TypeError(f"{config.model_type} isn't supported yet.")
@ -579,7 +656,9 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs)
merged_kwargs = {**model_init_kwargs, **cached_file_kwargs}
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **merged_kwargs)
model_config = model.config.to_dict()
seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
if any([k in model_config for k in seq_len_keys]):
@ -597,8 +676,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
@classmethod
def from_quantized(
cls,
model_name_or_path: Optional[str] = None,
save_dir: Optional[str] = None,
model_name_or_path: Optional[str],
device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None,
max_memory: Optional[dict] = None,
device: Optional[Union[str, int]] = None,
@ -613,6 +691,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
use_safetensors: bool = False,
trust_remote_code: bool = False,
warmup_triton: bool = False,
trainable: bool = False,
**kwargs
):
"""load quantized model from local disk"""
@ -628,20 +707,25 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
subfolder = kwargs.pop("subfolder", "")
commit_hash = kwargs.pop("_commit_hash", None)
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
"_raise_exceptions_for_missing_entries": False,
"_commit_hash": commit_hash,
}
if use_triton and not TRITON_AVAILABLE:
logger.warning("triton is not installed, reset use_triton to False")
use_triton = False
# == step1: prepare configs and file names == #
if model_name_or_path and save_dir:
logger.warning("save_dir will be ignored because model_name_or_path is explicit specified.")
if not model_name_or_path and save_dir:
model_name_or_path = save_dir
warnings.warn("save_dir is deprecated and will be removed in version 0.3.0", PendingDeprecationWarning, stacklevel=2)
if not model_name_or_path and not save_dir:
raise ValueError("at least one of model_name_or_path or save_dir should be specified.")
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code, **cached_file_kwargs)
if config.model_type not in SUPPORTED_MODELS:
raise TypeError(f"{config.model_type} isn't supported yet.")
@ -670,25 +754,11 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
resolved_archive_file = None
if is_local:
model_save_name = join(model_name_or_path, model_basename)
for ext in extensions:
if isfile(model_save_name + ext):
resolved_archive_file = model_save_name + ext
break
else: # remote
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
"_raise_exceptions_for_missing_entries": False,
"_commit_hash": commit_hash,
}
for ext in extensions:
resolved_archive_file = cached_file(model_name_or_path, model_basename + ext, **cached_file_kwargs)
if resolved_archive_file is not None:
@ -699,6 +769,9 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
model_save_name = resolved_archive_file
if not use_triton and trainable:
logger.warning("QuantLinear with cuda backend not support trainable mode yet, Switch to the pytorch backend.")
# == step2: convert model to gptq-model (replace Linear with QuantLinear) == #
def skip(*args, **kwargs):
pass
@ -734,7 +807,8 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
quantize_config.group_size,
use_triton=use_triton,
use_cuda_fp16=use_cuda_fp16,
desc_act=quantize_config.desc_act
desc_act=quantize_config.desc_act,
trainable=trainable
)
model.tie_weights()
@ -794,6 +868,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
# == step5: (optional) inject optimized module == #
if inject_fused_attention:
if cls.fused_attn_module_type is None:
inject_fused_attention = False
logger.warning(f"{cls.__name__} hasn't fused attention module yet, will skip inject fused attention.")
else:
cls.fused_attn_module_type.inject_to_model(
@ -801,10 +876,12 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
use_triton=use_triton,
group_size=quantize_config.group_size,
use_cuda_fp16=use_cuda_fp16,
desc_act=quantize_config.desc_act
desc_act=quantize_config.desc_act,
trainable=trainable
)
if inject_fused_mlp:
if cls.fused_mlp_module_type is None:
inject_fused_mlp = False
logger.warning(f"{cls.__name__} hasn't fused mlp module yet, will skip inject fused mlp.")
else:
cls.fused_mlp_module_type.inject_to_model(
@ -815,13 +892,26 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
model.eval()
# == step6: (optional) warmup triton == #
if use_triton and warmup_triton:
from ..nn_modules.qlinear_triton import QuantLinear
from ..nn_modules.qlinear.qlinear_triton import QuantLinear
QuantLinear.warmup(model, seqlen=model.seqlen)
if inject_fused_mlp and cls.fused_mlp_module_type is not None:
cls.fused_mlp_module_type.warmup(model, seqlen=model.seqlen)
return cls(model, True, quantize_config)
# == step7: make model compatible with peft
cls.make_sure_compatible_with_peft(
model, use_triton, quantize_config.desc_act, quantize_config.group_size
)
return cls(
model,
True,
quantize_config,
is_triton_backend=use_triton,
injected_fused_attention=inject_fused_attention,
injected_fused_mlp=inject_fused_mlp and use_triton,
trainable=trainable
)
def warmup_triton(self, enabled: bool = True):
if not enabled:
@ -830,11 +920,34 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
logger.warning(f"triton is not available, skip warmup stage directly.")
return
from ..nn_modules.qlinear_triton import QuantLinear
from ..nn_modules.qlinear.qlinear_triton import QuantLinear
QuantLinear.warmup(self.model, seqlen=self.model.seqlen)
if self.fused_mlp_module_type is not None:
self.fused_mlp_module_type.warmup(self.model, seqlen=self.model.seqlen)
def enable_trainable_mode(self, enabled: bool = True):
if not self.is_triton_backend and enabled:
raise NotImplementedError("For now, trainable mode only supports triton backend.")
for n, m in self.model.named_modules():
if hasattr(m, "trainable"):
setattr(m, "trainable", enabled)
def disable_trainable_mode(self):
self.enable_trainable_mode(enabled=False)
@staticmethod
def make_sure_compatible_with_peft(model: PreTrainedModel, use_triton: bool, desc_act: bool, group_size: int):
GeneralQuantLinear.inject_to_model(
model,
dynamically_import_QuantLinear(use_triton, desc_act, group_size)
)
def __getattr__(self, item):
try:
return super().__getattr__(item)
except:
return getattr(self.model, item)
__all__ = ["BaseGPTQForCausalLM", "BaseQuantizeConfig"]

View file

@ -1,12 +1,27 @@
from packaging.version import parse as parse_version
from torch import device
from transformers import __version__ as transformers_version
from ..utils.import_utils import compare_transformers_version
CPU = device("cpu")
CUDA_0 = device("cuda:0")
SUPPORTED_MODELS = ["bloom", "gptj", "gpt2", "gpt_neox", "opt", "moss", "gpt_bigcode", "codegen", "RefinedWebModel", "RefinedWeb", "mpt"]
SUPPORTED_MODELS = [
"bloom",
"gptj",
"gpt2",
"gpt_neox",
"opt",
"moss",
"gpt_bigcode",
"codegen",
"RefinedWebModel",
"RefinedWeb",
"baichuan",
"internlm",
"mpt",
]
if compare_transformers_version("v4.28.0", op="ge"):
SUPPORTED_MODELS.append("llama")
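`SUPPORTED_MODELS` now also lists `baichuan`, `internlm` and `mpt`. The sketch below mirrors the gate that `from_pretrained`/`from_quantized` apply against a model's config (see `_base.py` above); the repo id is only an example.
```python
from transformers import AutoConfig
from auto_gptq.modeling._const import SUPPORTED_MODELS

config = AutoConfig.from_pretrained("mosaicml/mpt-7b", trust_remote_code=True)
if config.model_type not in SUPPORTED_MODELS:
    raise TypeError(f"{config.model_type} isn't supported yet.")
print(config.model_type)  # "mpt"
```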

View file

@ -50,7 +50,17 @@ def get_module_by_name_suffix(model, module_name: str):
return module
def make_quant(module, names, bits, group_size, name='', use_triton=False, use_cuda_fp16=True, desc_act=False):
def make_quant(
module,
names,
bits,
group_size,
name='',
use_triton=False,
use_cuda_fp16=True,
desc_act=False,
trainable=False
):
QuantLinear = dynamically_import_QuantLinear(use_triton=use_triton, desc_act=desc_act, group_size=group_size)
if isinstance(module, QuantLinear):
@ -71,13 +81,25 @@ def make_quant(module, names, bits, group_size, name='', use_triton=False, use_c
in_features = tmp.weight.shape[0]
out_features = tmp.weight.shape[1]
if (not(desc_act) or group_size == -1) and not use_triton:
new_layer = QuantLinear(bits, group_size, in_features, out_features, True, use_cuda_fp16=use_cuda_fp16)
new_layer = QuantLinear(
bits, group_size, in_features, out_features, True, use_cuda_fp16=use_cuda_fp16, trainable=trainable
)
else:
new_layer = QuantLinear(bits, group_size, in_features, out_features, True)
new_layer = QuantLinear(bits, group_size, in_features, out_features, True, trainable=trainable)
new_layer.device = ori_layer_device
setattr(module, attr, new_layer.to(ori_layer_device))
for name1, child in module.named_children():
make_quant(child, names, bits, group_size, name + '.' + name1 if name != '' else name1, use_triton=use_triton, use_cuda_fp16=use_cuda_fp16,desc_act=desc_act)
make_quant(
child,
names,
bits,
group_size,
name + '.' + name1 if name != '' else name1,
use_triton=use_triton,
use_cuda_fp16=use_cuda_fp16,
desc_act=desc_act,
trainable=trainable
)
def pack_model(

View file

@ -1,4 +1,5 @@
from typing import Optional
from inspect import signature
from typing import Dict, Optional, Union
from ._base import BaseQuantizeConfig, BaseGPTQForCausalLM
from ._utils import check_and_get_model_type
@ -12,6 +13,8 @@ from .moss import MOSSGPTQForCausalLM
from .opt import OPTGPTQForCausalLM
from .rw import RWGPTQForCausalLM
from .gpt_bigcode import GPTBigCodeGPTQForCausalLM
from .baichuan import BaiChuanGPTQForCausalLM
from .internlm import InternLMGPTQForCausalLM
from .mpt import MPTGPTQForCausalLM
@ -26,8 +29,10 @@ GPTQ_CAUSAL_LM_MODEL_MAP = {
"gpt_bigcode": GPTBigCodeGPTQForCausalLM,
"codegen": CodeGenGPTQForCausalLM,
"RefinedWebModel": RWGPTQForCausalLM,
"RefinedWeb":RWGPTQForCausalLM,
"mpt": MPTGPTQForCausalLM
"RefinedWeb": RWGPTQForCausalLM,
"baichuan": BaiChuanGPTQForCausalLM,
"internlm": InternLMGPTQForCausalLM,
"mpt": MPTGPTQForCausalLM,
}
@ -48,7 +53,9 @@ class AutoGPTQForCausalLM:
trust_remote_code: bool = False,
**model_init_kwargs
) -> BaseGPTQForCausalLM:
model_type = check_and_get_model_type(pretrained_model_name_or_path, trust_remote_code)
model_type = check_and_get_model_type(
pretrained_model_name_or_path, trust_remote_code
)
return GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
quantize_config=quantize_config,
@ -60,8 +67,7 @@ class AutoGPTQForCausalLM:
@classmethod
def from_quantized(
cls,
model_name_or_path: Optional[str] = None,
save_dir: Optional[str] = None,
model_name_or_path: Optional[str],
device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None,
max_memory: Optional[dict] = None,
device: Optional[Union[str, int]] = None,
@ -75,14 +81,32 @@ class AutoGPTQForCausalLM:
use_safetensors: bool = False,
trust_remote_code: bool = False,
warmup_triton: bool = False,
trainable: bool = False,
**kwargs
) -> BaseGPTQForCausalLM:
model_type = check_and_get_model_type(save_dir or model_name_or_path, trust_remote_code)
model_type = check_and_get_model_type(model_name_or_path, trust_remote_code)
quant_func = GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized
keywords = {key: kwargs[key] for key in signature(quant_func).parameters if key in kwargs}
# A static list of kwargs needed for huggingface_hub
huggingface_kwargs = [
"cache_dir",
"force_download",
"proxies",
"resume_download",
"local_files_only",
"use_auth_token",
"revision",
"subfolder",
"_raise_exceptions_for_missing_entries",
"_commit_hash"
]
# TODO: do we need this filtering of kwargs? @PanQiWei is there a reason we can't just pass all kwargs?
keywords = {
key: kwargs[key]
for key in list(signature(quant_func).parameters.keys()) + huggingface_kwargs
if key in kwargs
}
return quant_func(
model_name_or_path=model_name_or_path,
save_dir=save_dir,
device_map=device_map,
max_memory=max_memory,
device=device,
@ -96,6 +120,7 @@ class AutoGPTQForCausalLM:
use_safetensors=use_safetensors,
trust_remote_code=trust_remote_code,
warmup_triton=warmup_triton,
trainable=trainable,
**keywords
)
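With the `huggingface_kwargs` pass-through, Hub-related options such as `cache_dir`, `revision` and `use_auth_token` now reach the remote file resolution step. A hedged sketch with a placeholder repo id:
```python
from auto_gptq import AutoGPTQForCausalLM

model = AutoGPTQForCausalLM.from_quantized(
    "someuser/some-gptq-model",  # placeholder repo id
    revision="main",             # forwarded to huggingface_hub when resolving files
    use_safetensors=True,
    device="cuda:0",
    use_triton=False,
    trainable=False,
)
```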

View file

@ -0,0 +1,16 @@
from ._base import *
class BaiChuanGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "DecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.W_pack"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"]
]
__all__ = ["BaiChuanGPTQForCausalLM"]

View file

@ -0,0 +1,16 @@
from ._base import *
class InternLMGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "InternLMDecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
__all__ = ["InternLMGPTQForCausalLM"]

View file

@ -18,7 +18,16 @@ class FusedBaseModule(nn.Module, TritonModuleMixin):
class FusedBaseAttentionModule(FusedBaseModule):
@classmethod
@abstractmethod
def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
def inject_to_model(
cls,
model,
use_triton=False,
group_size=-1,
use_cuda_fp16=True,
desc_act=False,
trainable=False,
**kwargs
):
raise NotImplementedError()
@classmethod

View file

@ -226,7 +226,16 @@ class FusedGPTJAttentionForQuantizedModel(FusedBaseAttentionModule):
return outputs # a, present, (attentions)
@classmethod
def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
def inject_to_model(
cls,
model,
use_triton=False,
group_size=-1,
use_cuda_fp16=True,
desc_act=False,
trainable=False,
**kwargs
):
config = model.config
QuantLinear = dynamically_import_QuantLinear(use_triton=use_triton, desc_act=desc_act, group_size=group_size)
@ -253,7 +262,7 @@ class FusedGPTJAttentionForQuantizedModel(FusedBaseAttentionModule):
q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
True if q_proj.bias is not None else False,
)
qlinear_kwargs = dict()
qlinear_kwargs = {"trainable": trainable}
if (not desc_act or group_size == -1) and not use_triton:
qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
qkv_proj = QuantLinear(*qlinear_args, **qlinear_kwargs)

View file

@ -126,7 +126,16 @@ class FusedLlamaAttentionForQuantizedModel(FusedBaseAttentionModule):
return attn_output, attn_weights, past_key_value
@classmethod
def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
def inject_to_model(
cls,
model,
use_triton=False,
group_size=-1,
use_cuda_fp16=True,
desc_act=False,
trainable=False,
**kwargs
):
"""
Replace all LlamaAttention modules with QuantLlamaAttention modules, fusing the q, k, v projections.
"""
@ -153,7 +162,7 @@ class FusedLlamaAttentionForQuantizedModel(FusedBaseAttentionModule):
q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
True if q_proj.bias is not None else False,
)
qlinear_kwargs = dict()
qlinear_kwargs = {"trainable": trainable}
if (not desc_act or group_size == -1) and not use_triton:
qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
qkv_layer = QuantLinear(*qlinear_args, **qlinear_kwargs)

View file

@ -237,14 +237,6 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
up_proj,
):
super().__init__()
self.register_buffer('gate_proj_qweight', gate_proj.qweight)
self.register_buffer('gate_proj_scales', gate_proj.scales)
self.register_buffer('gate_proj_qzeros', gate_proj.qzeros)
self.register_buffer('gate_proj_g_idx', gate_proj.g_idx)
self.register_buffer('up_proj_qweight', up_proj.qweight)
self.register_buffer('up_proj_scales', up_proj.scales)
self.register_buffer('up_proj_qzeros', up_proj.qzeros)
self.register_buffer('up_proj_g_idx', up_proj.g_idx)
self.infeatures = gate_proj.infeatures
self.intermediate_size = gate_proj.outfeatures
@ -252,6 +244,8 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
self.bits = gate_proj.bits
self.maxq = gate_proj.maxq
self.gate_proj = gate_proj
self.up_proj = up_proj
self.down_proj = down_proj
def forward(self, x):
@ -266,40 +260,20 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
c = torch.empty((M, N), device=x.device, dtype=torch.float16)
grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
quant_fused_matmul_248_kernel[grid](
x, c, self.gate_proj_qweight,
self.gate_proj_scales, self.gate_proj_qzeros, self.gate_proj_g_idx,
self.up_proj_qweight,
self.up_proj_scales, self.up_proj_qzeros, self.up_proj_g_idx,
x, c, self.gate_proj.qweight,
self.gate_proj.scales, self.gate_proj.qzeros, self.gate_proj.g_idx,
self.up_proj.qweight,
self.up_proj.scales, self.up_proj.qzeros, self.up_proj.g_idx,
M, N, K,
self.bits, self.maxq,
x.stride(0), x.stride(1),
self.gate_proj_qweight.stride(0), self.gate_proj_qweight.stride(1),
self.gate_proj.qweight.stride(0), self.gate_proj.qweight.stride(1),
c.stride(0), c.stride(1),
self.gate_proj_scales.stride(0), self.gate_proj_qzeros.stride(0)
self.gate_proj.scales.stride(0), self.gate_proj.qzeros.stride(0)
)
c = c.reshape(out_shape)
return c
def fused2cuda(self):
self.gate_proj_qweight = self.gate_proj_qweight.cuda()
self.gate_proj_scales = self.gate_proj_scales.cuda()
self.gate_proj_qzeros = self.gate_proj_qzeros.cuda()
self.gate_proj_g_idx = self.gate_proj_g_idx.cuda()
self.up_proj_qweight = self.up_proj_qweight.cuda()
self.up_proj_scales = self.up_proj_scales.cuda()
self.up_proj_qzeros = self.up_proj_qzeros.cuda()
self.up_proj_g_idx = self.up_proj_g_idx.cuda()
def fused2cpu(self):
self.gate_proj_qweight = self.gate_proj_qweight.cpu()
self.gate_proj_scales = self.gate_proj_scales.cpu()
self.gate_proj_qzeros = self.gate_proj_qzeros.cpu()
self.gate_proj_g_idx = self.gate_proj_g_idx.cpu()
self.up_proj_qweight = self.up_proj_qweight.cpu()
self.up_proj_scales = self.up_proj_scales.cpu()
self.up_proj_qzeros = self.up_proj_qzeros.cpu()
self.up_proj_g_idx = self.up_proj_g_idx.cpu()
@classmethod
def inject_to_model(cls, model, use_triton=False, **kwargs):
if not use_triton:

View file

@ -0,0 +1,57 @@
import torch.nn as nn
class GeneralQuantLinear(nn.Linear):
def __init__(self, quant_linear_module):
super().__init__(
in_features=quant_linear_module.infeatures,
out_features=quant_linear_module.outfeatures,
bias=True
)
self.infeatures = quant_linear_module.infeatures
self.outfeatures = quant_linear_module.outfeatures
self.bits = quant_linear_module.bits
self.group_size = quant_linear_module.group_size
self.maxq = quant_linear_module.maxq
self.weight.requires_grad = False
self.weight.data = quant_linear_module.qweight
self.qweight = self.weight
self.bias.data = quant_linear_module.bias
self.qweight.requires_grad = False
self.bias.requires_grad = False
self.qzeros = quant_linear_module.qzeros
self.scales = quant_linear_module.scales
self.g_idx = quant_linear_module.g_idx
if hasattr(quant_linear_module, "wf"):
self.wf = quant_linear_module.wf
if hasattr(quant_linear_module, "kernel_switch_threshold"):
self.kernel_switch_threshold = quant_linear_module.kernel_switch_threshold
if hasattr(quant_linear_module, "autogptq_cuda_available"):
self.autogptq_cuda_available = quant_linear_module.autogptq_cuda_available
self.trainable = quant_linear_module.trainable
self.forward = quant_linear_module.forward
@classmethod
def inject_to_model(cls, model, target_module_type):
for name, m in model.named_modules():
if not isinstance(m, target_module_type):
continue
new_m = cls(m)
if '.' in name:
parent_name = name.rsplit('.', 1)[0]
child_name = name[len(parent_name) + 1:]
parent = model.get_submodule(parent_name)
else:
parent_name = ''
parent = model
child_name = name
setattr(parent, child_name, new_m)
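`GeneralQuantLinear` re-exposes a quantized linear layer as a plain `nn.Linear` subclass so that libraries such as peft can target it; `from_quantized` already applies it via `make_sure_compatible_with_peft`. The sketch below shows roughly what that helper does, assuming `model` is a quantized model that has already been loaded and that the settings match how it was loaded.
```python
from auto_gptq.nn_modules.qlinear import GeneralQuantLinear
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear

# the settings must match how `model` was loaded (backend, act-order, group size)
QuantLinear = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=128)
GeneralQuantLinear.inject_to_model(model, QuantLinear)
```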

View file

@ -9,11 +9,13 @@ import transformers
logger = getLogger(__name__)
try:
import autogptq_cuda
import autogptq_cuda_256
import autogptq_cuda_64
_autogptq_cuda_available = True
except ImportError:
logger.warning('CUDA extension not installed.')
autogptq_cuda_256 = None
autogptq_cuda_64 = None
_autogptq_cuda_available = False
@ -26,10 +28,14 @@ class QuantLinear(nn.Module):
outfeatures,
bias,
kernel_switch_threshold=128,
trainable=False
):
super().__init__()
global _autogptq_cuda_available
if bits not in [2, 3, 4, 8]:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
if trainable:
_autogptq_cuda_available = False
self.infeatures = infeatures
self.outfeatures = outfeatures
@ -73,9 +79,15 @@ class QuantLinear(nn.Module):
self.kernel_switch_threshold = kernel_switch_threshold
self.autogptq_cuda_available = _autogptq_cuda_available
self.autogptq_cuda = autogptq_cuda_256
if infeatures % 256 != 0 or outfeatures % 256 != 0:
self.autogptq_cuda = autogptq_cuda_64
if infeatures % 64 != 0 or outfeatures % 64 != 0:
self.autogptq_cuda_available = False
self.trainable = trainable
def pack(self, linear, scales, zeros, g_idx=None):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
@ -184,13 +196,13 @@ class QuantLinear(nn.Module):
):
out = torch.zeros((x.shape[0], self.outfeatures), device=x.device, dtype=torch.float32)
if self.bits == 2:
autogptq_cuda.vecquant2matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
self.autogptq_cuda.vecquant2matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
elif self.bits == 3:
autogptq_cuda.vecquant3matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
self.autogptq_cuda.vecquant3matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
elif self.bits == 4:
autogptq_cuda.vecquant4matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
self.autogptq_cuda.vecquant4matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
elif self.bits == 8:
autogptq_cuda.vecquant8matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
self.autogptq_cuda.vecquant8matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
else:
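In this cuda wrapper, passing `trainable=True` bypasses the compiled CUDA kernels so the layer falls back to the pure-pytorch dequantization path. A quick hedged check with assumed layer sizes (`autogptq_cuda_available` is of course also `False` whenever the extension is not installed):
```python
from auto_gptq.nn_modules.qlinear.qlinear_cuda import QuantLinear

layer = QuantLinear(bits=4, group_size=128, infeatures=4096, outfeatures=4096, bias=True, trainable=True)
print(layer.trainable)                # True
print(layer.autogptq_cuda_available)  # False: trainable mode skips the CUDA extension
```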

View file

@ -7,15 +7,17 @@ import torch.nn as nn
import transformers
logger = getLogger(__name__)
try:
import autogptq_cuda
import autogptq_cuda_256
import autogptq_cuda_64
_autogptq_cuda_available = True
except ImportError:
logger.warning('CUDA extension not installed.')
autogptq_cuda_256 = None
autogptq_cuda_64 = None
_autogptq_cuda_available = False
class QuantLinear(nn.Module):
def __init__(
self,
@ -25,12 +27,15 @@ class QuantLinear(nn.Module):
outfeatures,
bias,
use_cuda_fp16=True,
kernel_switch_threshold=128
kernel_switch_threshold=128,
trainable=False
):
super().__init__()
global _autogptq_cuda_available
if bits not in [2, 3, 4, 8]:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
if trainable:
_autogptq_cuda_available = False
self.infeatures = infeatures
self.outfeatures = outfeatures
self.bits = bits
@ -77,10 +82,21 @@ class QuantLinear(nn.Module):
self.kernel_switch_threshold = kernel_switch_threshold
self.autogptq_cuda_available = _autogptq_cuda_available
self.autogptq_cuda = autogptq_cuda_256
if infeatures % 256 != 0 or outfeatures % 256 != 0:
self.autogptq_cuda = autogptq_cuda_64
if infeatures % 64 != 0 or outfeatures % 64 != 0:
self.autogptq_cuda_available = False
self.trainable = trainable
def pack(self, linear, scales, zeros, g_idx):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
W = W.flatten(1)
if isinstance(linear, transformers.pytorch_utils.Conv1D):
W = W.t()
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
@ -93,7 +109,7 @@ class QuantLinear(nn.Module):
g_idx = idx // self.group_size
intweight.append(
torch.round(
(linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]
(W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]
).to(torch.int)[:, None]
)
intweight = torch.cat(intweight, dim=1)
@ -182,24 +198,24 @@ class QuantLinear(nn.Module):
if self.use_cuda_fp16:
x = x.half()
if self.bits == 2:
autogptq_cuda.vecquant2matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
self.autogptq_cuda.vecquant2matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
elif self.bits == 3:
autogptq_cuda.vecquant3matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
self.autogptq_cuda.vecquant3matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
elif self.bits == 4:
autogptq_cuda.vecquant4matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
self.autogptq_cuda.vecquant4matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
else:
raise NotImplementedError("Only 2,3,4 bits are supported.")
else:
x = x.float()
if self.bits == 2:
autogptq_cuda.vecquant2matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
self.autogptq_cuda.vecquant2matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
elif self.bits == 3:
autogptq_cuda.vecquant3matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
self.autogptq_cuda.vecquant3matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
elif self.bits == 4:
autogptq_cuda.vecquant4matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
self.autogptq_cuda.vecquant4matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
elif self.bits == 8:
autogptq_cuda.vecquant8matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
self.autogptq_cuda.vecquant8matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
else:

View file

@ -1,17 +1,20 @@
import math
from logging import getLogger
import numpy as np
import torch
import torch.nn as nn
import transformers
from torch.cuda.amp import custom_bwd, custom_fwd
from logging import getLogger
from .triton_utils.mixin import TritonModuleMixin
from ..triton_utils.mixin import TritonModuleMixin
logger = getLogger(__name__)
try:
from .triton_utils.kernels import quant_matmul_248, transpose_quant_matmul_248, QuantLinearFunction
from ..triton_utils.kernels import (
quant_matmul_248, transpose_quant_matmul_248, quant_matmul_inference_only_248,
QuantLinearFunction, QuantLinearInferenceOnlyFunction
)
except ImportError:
logger.error('triton not installed.')
raise
@ -24,13 +27,14 @@ class QuantLinear(nn.Module, TritonModuleMixin):
group_size,
infeatures,
outfeatures,
bias
bias,
trainable=False
):
super().__init__()
if bits not in [2, 4, 8]:
raise NotImplementedError("Only 2,4,8 bits are supported.")
if infeatures % 256 != 0 or outfeatures % 256 != 0:
raise NotImplementedError("in_feature or out_feature must be divisible by 256.")
if infeatures % 32 != 0 or outfeatures % 32 != 0:
raise NotImplementedError("in_feature and out_feature must be divisible by 32.")
self.infeatures = infeatures
self.outfeatures = outfeatures
self.bits = bits
@ -58,6 +62,8 @@ class QuantLinear(nn.Module, TritonModuleMixin):
else:
self.bias = None
self.trainable = trainable
def pack(self, linear, scales, zeros, g_idx=None):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
@ -122,7 +128,8 @@ class QuantLinear(nn.Module, TritonModuleMixin):
def forward(self, x):
out_shape = x.shape[:-1] + (self.outfeatures,)
out = QuantLinearFunction.apply(
quant_linear_fn = QuantLinearFunction if self.trainable else QuantLinearInferenceOnlyFunction
out = quant_linear_fn.apply(
x.reshape(-1, x.shape[-1]),
self.qweight,
self.scales,
@ -160,11 +167,14 @@ class QuantLinear(nn.Module, TritonModuleMixin):
for m in tqdm(range(0, math.ceil(math.log2(seqlen)) + 1)):
m = 2 ** m
for (k, n), (qweight, scales, qzeros, g_idx, bits, maxq) in kn_values.items():
if transpose:
a = torch.randn(m, k, dtype=torch.float16, device=model.device)
quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
if transpose:
a = torch.randn(m, n, dtype=torch.float16, device=model.device)
transpose_quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
else:
a = torch.randn(m, k, dtype=torch.float16, device=model.device)
quant_matmul_inference_only_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
del kn_values
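The triton `QuantLinear` now accepts shapes divisible by 32 (previously 256) and a `trainable` flag that routes the forward pass through `QuantLinearFunction`, which defines a backward. A small sketch with arbitrary sizes (requires triton and a CUDA device; a freshly constructed layer holds zero-initialized buffers, so the output is only meaningful after `pack`):
```python
import torch
from auto_gptq.nn_modules.qlinear.qlinear_triton import QuantLinear

layer = QuantLinear(bits=4, group_size=32, infeatures=96, outfeatures=64, bias=True, trainable=True).cuda()
x = torch.zeros(2, 96, dtype=torch.float16, device="cuda")
y = layer(x)    # trainable=True -> QuantLinearFunction (with backward) instead of the inference-only path
print(y.shape)  # torch.Size([2, 64])
```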

View file

@ -73,27 +73,7 @@ logger = getLogger(__name__)
},
num_stages=2,
num_warps=8
),
triton.Config(
{
'BLOCK_SIZE_M': 64,
'BLOCK_SIZE_N': 64,
'BLOCK_SIZE_K': 64,
'GROUP_SIZE_M': 8
},
num_stages=3,
num_warps=8
),
triton.Config(
{
'BLOCK_SIZE_M': 32,
'BLOCK_SIZE_N': 32,
'BLOCK_SIZE_K': 128,
'GROUP_SIZE_M': 8
},
num_stages=2,
num_warps=4
),
)
],
key=['M', 'N', 'K'],
nearest_power_of_two=True,
@ -244,27 +224,7 @@ def quant_matmul_248_kernel(
},
num_stages=2,
num_warps=8
),
triton.Config(
{
'BLOCK_SIZE_M': 64,
'BLOCK_SIZE_N': 64,
'BLOCK_SIZE_K': 64,
'GROUP_SIZE_M': 8
},
num_stages=3,
num_warps=8
),
triton.Config(
{
'BLOCK_SIZE_M': 32,
'BLOCK_SIZE_N': 128,
'BLOCK_SIZE_K': 32,
'GROUP_SIZE_M': 8
},
num_stages=2,
num_warps=4
),
)
],
key=['M', 'N', 'K'],
nearest_power_of_two=True
@ -356,7 +316,6 @@ def silu(x):
return x * tl.sigmoid(x)
def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq):
with torch.cuda.device(input.device):
output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=input.dtype)
@ -414,3 +373,30 @@ class QuantLinearFunction(torch.autograd.Function):
if ctx.needs_input_grad[0]:
grad_input = transpose_quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
return grad_input, None, None, None, None, None, None
def quant_matmul_inference_only_248(input, qweight, scales, qzeros, g_idx, bits, maxq):
with torch.cuda.device(input.device):
output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16)
grid = lambda META: (
triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(qweight.shape[1], META['BLOCK_SIZE_N']),
)
quant_matmul_248_kernel[grid](
input, qweight, output,
scales, qzeros, g_idx,
input.shape[0], qweight.shape[1], input.shape[1],
bits, maxq,
input.stride(0), input.stride(1),
qweight.stride(0), qweight.stride(1),
output.stride(0), output.stride(1),
scales.stride(0), qzeros.stride(0)
)
return output
class QuantLinearInferenceOnlyFunction(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
output = quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq)
return output

View file

@ -0,0 +1 @@
from .perplexity_utils import Perplexity

View file

@ -7,15 +7,22 @@ try:
except ImportError:
TRITON_AVAILABLE = False
try:
import autogptq_cuda
AUTOGPTQ_CUDA_AVAILABLE = True
except:
AUTOGPTQ_CUDA_AVAILABLE = False
def dynamically_import_QuantLinear(use_triton: bool, desc_act: bool, group_size: int):
if use_triton:
from ..nn_modules.qlinear_triton import QuantLinear
from ..nn_modules.qlinear.qlinear_triton import QuantLinear
else:
if not desc_act or group_size == -1:
from ..nn_modules.qlinear_old import QuantLinear
from ..nn_modules.qlinear.qlinear_cuda_old import QuantLinear
else:
from ..nn_modules.qlinear import QuantLinear
from ..nn_modules.qlinear.qlinear_cuda import QuantLinear
return QuantLinear
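The dispatch above decides which kernel implementation a quantized layer gets. A tiny sketch of two of the outcomes (the triton branch additionally requires triton to be installed):
```python
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear

ql = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=128)
print(ql.__module__)  # auto_gptq.nn_modules.qlinear.qlinear_cuda_old

ql = dynamically_import_QuantLinear(use_triton=False, desc_act=True, group_size=128)
print(ql.__module__)  # auto_gptq.nn_modules.qlinear.qlinear_cuda
```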

View file

@ -0,0 +1,423 @@
import warnings
import re
from contextlib import contextmanager
from dataclasses import asdict
from enum import Enum
from typing import List, Optional
import torch
from peft import get_peft_model, PeftConfig, PeftModel, PeftType
from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING
from peft.tuners.lora import LoraConfig, LoraLayer, LoraModel, Embedding
from peft.tuners.adalora import AdaLoraConfig, AdaLoraLayer, AdaLoraModel
from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING
from peft.utils.other import _get_submodules
from ..modeling._base import BaseGPTQForCausalLM
class GPTQLoraConfig(LoraConfig):
injected_fused_attention: bool = False
injected_fused_mlp: bool = False
class GPTQLoraLinear(torch.nn.Linear, LoraLayer):
def __init__(
self,
adapter_name: str,
linear_module: torch.nn.Linear,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
**kwargs,
):
init_lora_weights = kwargs.pop("init_lora_weights", True)
torch.nn.Linear.__init__(self, linear_module.in_features, linear_module.out_features)
LoraLayer.__init__(self, linear_module.in_features, linear_module.out_features)
self.linear_module = linear_module
self.weight.requires_grad = False
self.weight = self.linear_module.weight
self.bias = self.linear_module.bias
self.fan_in_fan_out = fan_in_fan_out
if fan_in_fan_out:
self.weight.data = self.weight.data.T
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
self.active_adapter = adapter_name
def reset_lora_parameters(self, adapter_name):
if adapter_name in self.lora_A.keys():
torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
torch.nn.init.zeros_(self.lora_B[adapter_name].weight)
def merge(self):
raise NotImplementedError("gptq model not support merge lora adapter")
def unmerge(self):
raise NotImplementedError("gptq model not support unmerge lora adapter")
def forward(self, x: torch.Tensor):
previous_dtype = x.dtype
if self.active_adapter not in self.lora_A.keys():
return self.linear_module(x)
if self.disable_adapters:
if self.r[self.active_adapter] > 0 and self.merged:
self.unmerge()
result = self.linear_module(x)
elif self.r[self.active_adapter] > 0 and not self.merged:
result = self.linear_module(x)
lora_B = self.lora_B[self.active_adapter]
lora_A = self.lora_A[self.active_adapter]
lora_dropout = self.lora_dropout[self.active_adapter]
scale = self.scaling[self.active_adapter]
x = x.type_as(lora_A.weight.data)
adapter_result = (lora_B(lora_A(lora_dropout(x))) * scale).type_as(result)
result += adapter_result
else:
result = self.linear_module(x)
result = result.to(previous_dtype)
return result
class GPTQLoraModel(LoraModel):
def _find_and_replace(self, adapter_name):
lora_config = self.peft_config[adapter_name]
is_target_modules_in_base_model = False
kwargs = {
"r": lora_config.r,
"lora_alpha": lora_config.lora_alpha,
"lora_dropout": lora_config.lora_dropout,
"fan_in_fan_out": lora_config.fan_in_fan_out,
"init_lora_weights": lora_config.init_lora_weights,
}
key_list = [key for key, _ in self.model.named_modules()]
for key in key_list:
if isinstance(lora_config.target_modules, str):
target_module_found = re.fullmatch(lora_config.target_modules, key)
else:
target_module_found = any(key.endswith(target_key) for target_key in lora_config.target_modules)
if target_module_found:
if not is_target_modules_in_base_model:
is_target_modules_in_base_model = True
parent, target, target_name = _get_submodules(self.model, key)
bias = False
if hasattr(target, "bias"):
bias = target.bias is not None
if isinstance(target, LoraLayer):
target.update_layer(
adapter_name,
lora_config.r,
lora_config.lora_alpha,
lora_config.lora_dropout,
lora_config.init_lora_weights,
)
else:
if isinstance(target, torch.nn.Embedding):
embedding_kwargs = kwargs.copy()
embedding_kwargs.pop("fan_in_fan_out", None)
in_features, out_features = target.num_embeddings, target.embedding_dim
new_module = Embedding(adapter_name, in_features, out_features, **embedding_kwargs)
else:
if isinstance(target, torch.nn.Linear):
if kwargs["fan_in_fan_out"]:
warnings.warn(
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
"Setting fan_in_fan_out to False."
)
kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
else:
raise ValueError(
f"Target module {target} is not supported. "
f"Currently, only `torch.nn.Linear` and its subclasses are supported."
)
new_module = GPTQLoraLinear(adapter_name, target, **kwargs)
self._replace_module(parent, target_name, new_module, target)
if not is_target_modules_in_base_model:
raise ValueError(
f"Target modules {lora_config.target_modules} not found in the base model. "
f"Please check the target modules and try again."
)
def _replace_module(self, parent_module, child_name, new_module, old_module):
setattr(parent_module, child_name, new_module)
if not isinstance(new_module, GPTQLoraLinear):
new_module.weight = old_module.weight
if hasattr(old_module, "bias"):
if old_module.bias is not None:
new_module.bias = old_module.bias
if getattr(old_module, "state", None) is not None:
new_module.state = old_module.state
new_module.to(old_module.weight.device)
# dispatch to correct device
for name, module in new_module.named_modules():
if "lora_" in name:
module.to(old_module.weight.device)
def merge_adapter(self):
raise NotImplementedError("gptq model not support merge ada lora adapter")
def unmerge_adapter(self):
raise NotImplementedError("gptq model not support unmerge ada lora adapter")
def merge_and_unload(self):
raise NotImplementedError("gptq model not support merge and unload")
class GPTQAdaLoraConfig(AdaLoraConfig):
injected_fused_attention: bool = False
injected_fused_mlp: bool = False
class GPTQSVDLinear(torch.nn.Linear, AdaLoraLayer):
def __init__(
self,
adapter_name: str,
linear_module: torch.nn.Linear,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
**kwargs,
):
init_lora_weights = kwargs.pop("init_lora_weights", True)
torch.nn.Linear.__init__(self, linear_module.in_features, linear_module.out_features)
AdaLoraLayer.__init__(self, linear_module.in_features, linear_module.out_features)
self.linear_module = linear_module
self.weight.requires_grad = False
self.weight = self.linear_module.weight
self.bias = self.linear_module.bias
self.fan_in_fan_out = fan_in_fan_out
if fan_in_fan_out:
self.weight.data = self.weight.data.T
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
self.active_adapter = adapter_name
def merge(self):
raise NotImplementedError("gptq model not support merge lora adapter")
def unmerge(self):
raise NotImplementedError("gptq model not support unmerge lora adapter")
def forward(self, x: torch.Tensor):
if self.active_adapter not in self.lora_A.keys():
return self.linear_module(x)
if self.disable_adapters:
if self.r[self.active_adapter] > 0 and self.merged:
self.unmerge()
result = self.linear_module(x)
elif self.r[self.active_adapter] > 0 and not self.merged:
result = self.linear_module(x)
result += (
(
self.lora_dropout[self.active_adapter](x)
@ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter]).T
@ self.lora_B[self.active_adapter].T
)
* self.scaling[self.active_adapter]
/ (self.ranknum[self.active_adapter] + 1e-5)
)
else:
result = self.linear_module(x)
return result
class GPTQAdaLoraModel(AdaLoraModel):
def _find_and_replace(self, adapter_name):
lora_config = self.peft_config[adapter_name]
is_target_modules_in_base_model = False
kwargs = {
"r": lora_config.init_r,
"lora_alpha": lora_config.lora_alpha,
"lora_dropout": lora_config.lora_dropout,
"fan_in_fan_out": lora_config.fan_in_fan_out,
"init_lora_weights": lora_config.init_lora_weights,
}
key_list = [key for key, _ in self.model.named_modules()]
for key in key_list:
if isinstance(lora_config.target_modules, str):
target_module_found = re.fullmatch(lora_config.target_modules, key)
else:
target_module_found = any(key.endswith(target_key) for target_key in lora_config.target_modules)
if target_module_found:
if not is_target_modules_in_base_model:
is_target_modules_in_base_model = True
parent, target, target_name = _get_submodules(self.model, key)
bias = target.bias is not None
if isinstance(target, LoraLayer):
target.update_layer(
adapter_name,
lora_config.init_r,
lora_config.lora_alpha,
lora_config.lora_dropout,
lora_config.init_lora_weights,
)
else:
if isinstance(target, torch.nn.Linear):
in_features, out_features = target.in_features, target.out_features
if kwargs["fan_in_fan_out"]:
warnings.warn(
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
"Setting fan_in_fan_out to False."
)
kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
else:
raise ValueError(
f"Target module {target} is not supported. "
f"Currently, only `torch.nn.Linear` and its subclasses are supported."
)
new_module = GPTQSVDLinear(adapter_name, target, **kwargs)
self._replace_module(parent, target_name, new_module, target)
if not is_target_modules_in_base_model:
raise ValueError(
f"Target modules {lora_config.target_modules} not found in the base model. "
f"Please check the target modules and try again."
)
def _replace_module(self, parent_module, child_name, new_module, old_module):
setattr(parent_module, child_name, new_module)
# dispatch to correct device
for name, module in new_module.named_modules():
if "lora_" in name:
module.to(old_module.weight.device)
    def merge_adapter(self):
        raise NotImplementedError("gptq model does not support merging adalora adapters")
    def unmerge_adapter(self):
        raise NotImplementedError("gptq model does not support unmerging adalora adapters")
    def merge_and_unload(self):
        raise NotImplementedError("gptq model does not support merge_and_unload")
def find_all_linear_names(model: BaseGPTQForCausalLM, ignore: Optional[List[str]] = None, ignore_lm_head: bool = True):
if not ignore:
ignore = []
lm_head_name = model.lm_head_name
if ignore_lm_head and lm_head_name not in ignore:
ignore.append(lm_head_name)
results = set()
for n, m in model.named_modules():
if isinstance(m, torch.nn.Linear):
res = n.split('.')[-1]
if res not in ignore:
results.add(res)
return list(results)
@contextmanager
def hijack_peft_mappings():
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel
try:
yield
except:
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel
raise
finally:
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel
def get_gptq_peft_model(
model: BaseGPTQForCausalLM,
peft_config: PeftConfig = None,
model_id: str = None,
adapter_name: str = "default",
auto_find_all_linears: bool = True,
train_mode: bool = False
):
if train_mode and not model.trainable:
model.enable_trainable_mode()
if train_mode and not peft_config:
raise ValueError("peft_config not specified when in train mode.")
if not train_mode and not model_id:
raise ValueError("model_id(where to load adapters) not specified when in inference mode.")
if model.fused_attn_module_type is not None and not model.injected_fused_attention:
peft_types = [PeftType.LORA.value, PeftType.ADALORA.value]
warnings.warn(
f"You can just ignore this warning if the peft type you use isn't in {peft_types}.\n"
f"{model.__class__.__name__} supports injecting fused attention but not enables this time. "
"If you are training adapters, you must also disable fused attention injection when loading quantized "
"base model at inference time, otherwise adapters may not be added to base model properly. "
"If you are loading adapters to do inference, you can reference to adapter's config file to check "
"whether the adapters are trained using base model that not enable fused attention injection."
)
if model.injected_fused_mlp:
raise NotImplementedError("GPTQ model that enables fused mlp injection is not supported to integrate with peft.")
if train_mode:
peft_type = peft_config.peft_type
if not isinstance(peft_type, str):
peft_type = peft_type.value
if peft_type in [PeftType.LORA.value, PeftType.ADALORA.value]:
if auto_find_all_linears:
peft_config.target_modules = find_all_linear_names(model, ignore_lm_head=True)
if peft_type == PeftType.LORA.value and not isinstance(peft_config, GPTQLoraConfig):
peft_config = GPTQLoraConfig(**peft_config.to_dict())
if peft_type == PeftType.ADALORA.value and not isinstance(peft_config, GPTQAdaLoraConfig):
peft_config = GPTQAdaLoraConfig(**peft_config.to_dict())
peft_config.injected_fused_attention = model.injected_fused_attention
peft_config.injected_fused_mlp = model.injected_fused_mlp
if peft_type == PeftType.ADAPTION_PROMPT.value:
if peft_config.adapter_layers > model.config.num_hidden_layers:
warnings.warn(
f"model has only {model.config.num_hidden_layers} layers "
f"but adapter_layers is set to {peft_config.adapter_layers}, "
f"will reset value to {model.config.num_hidden_layers}."
)
peft_config.adapter_layers = model.config.num_hidden_layers
if model.injected_fused_attention:
raise NotImplementedError(
"model with fused attention injected isn't supported to use ADAPTION_PROMPT peft type yet."
)
with hijack_peft_mappings():
try:
if train_mode:
peft_model = get_peft_model(model.model, peft_config)
else:
peft_model = PeftModel.from_pretrained(model.model, model_id, adapter_name)
except:
raise NotImplementedError(
f"{model.__class__.__name__} not support {peft_config.peft_type.value} peft type yet."
)
return peft_model
__all__ = [
"GPTQLoraConfig",
"GPTQLoraModel",
"GPTQAdaLoraConfig",
"GPTQAdaLoraModel",
"find_all_linear_names",
"get_gptq_peft_model"
]
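For orientation, a minimal training-mode sketch of how the helpers above fit together; the model path is a placeholder and the calls mirror the peft example scripts added later in this commit:

```python
from peft import TaskType

from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.peft_utils import GPTQLoraConfig

# load a trainable quantized model; triton kernels are required for adapter training
model = AutoGPTQForCausalLM.from_quantized(
    "PATH/TO/QUANTIZED/MODEL",  # placeholder: local dir or Hub repo id
    use_triton=True,
    warmup_triton=False,
    trainable=True,
    inject_fused_attention=True,
    inject_fused_mlp=False,
)
model.warmup_triton()

# wrap the quantized base model with a LoRA adapter; target linear modules are found automatically
peft_config = GPTQLoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
```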


@ -0,0 +1,215 @@
import sys
import torch
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
class Perplexity:
"""
A class for calculating the perplexity of a language model.
"""
def __init__(self, model, tokenizer, dataset_path='wikitext', dataset_name=None, split='test', text_column='text'):
"""
Calculate perplexity using the same method as seen in llama.cpp.
Parameters
----------
model : AutoModelForCausalLM
The language model for which the perplexity is calculated.
tokenizer : AutoTokenizer
The tokenizer corresponding to the model.
dataset_path : str, optional
The path to the dataset on the Hugging Face dataset hub. Default is 'wikitext'.
dataset_name : str, optional
The name of the dataset. Default is None.
split : str, optional
The split of the dataset to use. Default is 'test'.
text_column : str, optional
The name of the column in the dataset that contains the text data. Default is 'text'.
"""
self._model = model
self._tokenizer = tokenizer
self._dataset_path = dataset_path
self._dataset_name = dataset_name
self._split = split
self._text_column = text_column
self._text = self._prepare_data()
def _get_device(self):
if torch.backends.mps.is_available():
return 'mps'
elif torch.cuda.is_available():
return 'cuda:0'
else:
return 'cpu'
def _prepare_data(self):
"""
Prepares the dataset by loading and formatting.
Returns
-------
str
The formatted dataset as a single string.
"""
if self._dataset_path == 'wikitext':
self._dataset_name = 'wikitext-2-raw-v1'
# Load the dataset
data = load_dataset(self._dataset_path, self._dataset_name, split=self._split)
# Format the text column of the dataset
text_list = [' \n' if s == '' else s for s in data[self._text_column]]
return ''.join(text_list)
@staticmethod
def softmax(logits):
"""
Static method for applying the softmax function.
Parameters
----------
logits : np.ndarray
The input to the softmax function.
Returns
-------
np.ndarray
The output of the softmax function.
"""
e_x = np.exp(logits - np.max(logits))
return e_x / e_x.sum(axis=0)
def calculate_perplexity(self, n_ctx=512, n_batch=512):
"""
Calculates the perplexity of the language model.
Parameters
----------
n_ctx : int
The context size.
n_batch : int
The batch size.
Returns
-------
list
The list of perplexity scores calculated.
"""
# Tokenize the text
self._tokenizer.model_max_length = sys.maxsize
tokens = self._tokenizer(self._text, truncation=False, return_tensors='pt').input_ids.to(self._model.device)
nll = 0.0 # Negative log likelihood
count = 0 # Counter for processed tokens
curr_ppl = 0
all_perplexity = []
with tqdm(range(len(tokens[0]) // n_ctx), desc="Perplexity: - ") as progress:
for i in progress:
# Process each batch of tokens
nll, count = self._process_batch(i, n_ctx, n_batch, tokens, nll, count)
# Calculate and display the current perplexity
curr_ppl = np.exp(nll / count)
all_perplexity.append(curr_ppl)
progress.set_description(f"Perplexity: {curr_ppl:.4f}")
return all_perplexity
def _process_batch(self, i, n_ctx, n_batch, tokens, nll, count):
"""
Processes each batch of tokens.
Parameters
----------
i : int
The batch index.
n_ctx : int
The context size.
n_batch : int
The batch size.
tokens : torch.Tensor
The tokenized text.
nll : float
The current negative log likelihood.
count : int
The current count of processed tokens.
Returns
-------
float
The updated negative log likelihood.
int
The updated count of processed tokens.
"""
start = i * n_ctx
end = start + n_ctx
num_batches = (n_ctx + n_batch - 1) // n_batch
logits = []
for j in range(num_batches):
batch_start = start + j * n_batch
batch_size = min(end - batch_start, n_batch)
token_org = tokens[0][batch_start].item()
if j == 0:
# Replace the first token with the BOS token
tokens[0][batch_start] = self._tokenizer.bos_token_id
# Compute the logits for the current batch of tokens
batch_logits = self._compute_batch_logits(tokens, batch_start, batch_size)
tokens[0][batch_start] = token_org
logits.append(batch_logits)
# We rely on the fact that attention in the forward pass only looks at previous
# tokens here, so the logits returned for each token are an accurate representation
# of what the model would have predicted at that point.
#
# Example, we have a context window of 512, we will compute perplexity for each of the
# last 256 tokens. Then, we split the input up into context window size chunks to
# process the entire prompt.
for j in range(min(512, n_ctx // 2), n_ctx - 1):
tok_logits = logits[0][0][j].cpu().numpy()
# Compute the probability of the next token
prob = self.softmax(tok_logits)[tokens[0][start + j + 1]]
# Update the negative log likelihood and the count of processed tokens
nll += -np.log(prob, where=prob>0)
count += 1
return nll, count
def _compute_batch_logits(self, tokens, batch_start, batch_size):
"""
Computes the logits for a batch of tokens.
Parameters
----------
tokens : torch.Tensor
The tokenized text.
batch_start : int
The start index of the batch.
batch_size : int
The size of the batch.
Returns
-------
torch.Tensor
The logits for the batch of tokens.
"""
# Compute the logits without keeping track of gradients
with torch.no_grad():
outputs = self._model(tokens[:, batch_start:batch_start+batch_size])
return outputs.logits.detach()
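The chunk loop above reports `np.exp(nll / count)`, i.e. $\mathrm{PPL} = \exp\!\big(-\tfrac{1}{N}\sum_{i=1}^{N}\log p(x_i \mid x_{<i})\big)$ over the tokens scored so far. For orientation, a minimal usage sketch of the class; the model and dataset names are placeholders, and the full CLI wrapper is the `examples/benchmark/perplexity.py` script added later in this commit:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# any causal LM works here; gpt2 is only a small placeholder
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto")

ppl = Perplexity(model, tokenizer, dataset_path="wikitext", split="test", text_column="text")
scores = ppl.calculate_perplexity(n_ctx=512, n_batch=512)
print(scores[-1])  # running perplexity after the last chunk
```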


@ -172,16 +172,16 @@ void vecquant4matmul_faster_old(
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant2matmul_old", &vecquant2matmul_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant3matmul_old", &vecquant3matmul_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant4matmul_old", &vecquant4matmul_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant8matmul_old", &vecquant8matmul_old, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA), faster version");
m.def("vecquant3matmul_faster_old", &vecquant3matmul_faster_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA), faster version");
m.def("vecquant4matmul_faster_old", &vecquant4matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
}


@ -0,0 +1,187 @@
#include <torch/all.h>
#include <torch/python.h>
#include <c10/cuda/CUDAGuard.h>
void vecquant2matmul_cuda(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
);
void vecquant2matmul(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant2matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}
void vecquant3matmul_cuda(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
);
void vecquant3matmul(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant3matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}
void vecquant4matmul_cuda(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
);
void vecquant4matmul(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant4matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}
void vecquant8matmul_cuda(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
);
void vecquant8matmul(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}
// old
void vecquant2matmul_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
);
void vecquant2matmul_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant2matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}
void vecquant3matmul_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
);
void vecquant3matmul_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant3matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}
void vecquant4matmul_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
);
void vecquant4matmul_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant4matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}
void vecquant8matmul_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
);
void vecquant8matmul_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant8matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}
void vecquant2matmul_faster_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
);
void vecquant2matmul_faster_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant2matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}
void vecquant3matmul_faster_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
);
void vecquant3matmul_faster_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant3matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}
void vecquant4matmul_faster_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
);
void vecquant4matmul_faster_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant4matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant2matmul_old", &vecquant2matmul_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant3matmul_old", &vecquant3matmul_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant4matmul_old", &vecquant4matmul_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant8matmul_old", &vecquant8matmul_old, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA), faster version");
m.def("vecquant3matmul_faster_old", &vecquant3matmul_faster_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA), faster version");
m.def("vecquant4matmul_faster_old", &vecquant4matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
}


@ -7,29 +7,66 @@
// atomicAdd for double-precision floating-point numbers on hardware with
// compute capability < 6.0 from:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__device__ double atomicAdd(
double* address,
double val
) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
// __device__ double atomicAdd(
// double* address,
// double val
// ) {
// unsigned long long int* address_as_ull = (unsigned long long int*)address;
// unsigned long long int old = *address_as_ull, assumed;
//
// do {
// assumed = old;
// old = atomicCAS(
// address_as_ull,
// assumed,
// __double_as_longlong(val + __longlong_as_double(assumed))
// );
//
// // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
// } while (assumed != old);
//
// return __longlong_as_double(old);
// }
// #endif
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
  do {
    assumed = old;
    unsigned short hsum = reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
    hsum += val;
    old = reinterpret_cast<size_t>(address) & 2
             ? (old & 0xffff) | (hsum << 16)
             : (old & 0xffff0000) | hsum;
    old = atomicCAS(address_as_ui, assumed, old);
  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
}
__device__ __forceinline__ void atomicAdd(__half* address, c10::Half val) {
  unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
  unsigned int old = *address_as_ui;
  unsigned int assumed;
  do {
    assumed = old;
    __half_raw hsum;
    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
    half tmpres = __hadd(hsum, val);
    hsum = __half_raw(tmpres);
    old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
#endif
template <typename scalar_t>
__global__ void VecQuant2MatMulKernel(
const scalar_t* __restrict__ vec,

File diff suppressed because it is too large

@ -1,4 +1,6 @@
## <center>News or Update</center>
- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
- 2023-06-05 - (Update) - Integrate with 🤗 peft to use gptq quantized model to train adapters, support LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-05-30 - (Update) - support download/upload quantized model from/to 🤗 Hub.
- 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefineWeb/RefineWebModel`(falcon) model types.
- 2023-05-04 - (Update) - Support using faster cuda kernel when `not desc_act or group_size == -1`


@ -13,9 +13,9 @@ python basic_usage.py
This script also showcases how to download/upload quantized models from/to 🤗 Hub; to enable those features, you can uncomment the commented code. A short, hedged sketch of the download path is shown below.
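For reference, a minimal sketch of the download half (loading a GPTQ checkpoint straight from the Hub); the repo id and basename are placeholders borrowed from the perplexity example later in this commit:

```python
from auto_gptq import AutoGPTQForCausalLM

# repo id and basename are placeholders; any GPTQ-quantized checkpoint on the Hub should work
model = AutoGPTQForCausalLM.from_quantized(
    "TheBloke/open-llama-7b-open-instruct-GPTQ",
    model_basename="gptq_model-4bit-128g",
    use_safetensors=True,
    device_map="auto",
)
```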
To Execute `basic_usage_with_wikitext2.py`, using command like this:
To Execute `basic_usage_wikitext2.py`, using command like this:
```shell
python basic_usage_with_wikitext2.py
python basic_usage_wikitext2.py
```
> Note: There is about a 0.6 ppl degradation on the opt-125m model using AutoGPTQ, compared to GPTQ-for-LLaMa.
@ -66,11 +66,48 @@ Use `--help` flag to see detailed descriptions for more command arguments.
> Commands in this chapter should be run under `benchmark` folder.
### Generation Speed
`generation_speed.py` scripts gives an example of how to benchmark the generations speed of pretrained and quantized models that `auto_gptq` supports, this benchmarks model generation speed in tokens/s metric.
`generation_speed.py` script gives an example of how to benchmark the generations speed of pretrained and quantized models that `auto_gptq` supports, this benchmarks model generation speed in tokens/s metric.
To eexcute this script, using command like this:
To execute this script, using command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python generation_speed.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
## PEFT
> Commands in this chapter should be run under `peft` folder.
### Lora
The `peft_lora_clm_instruction_tuning.py` script gives an example of instruction tuning a gptq quantized model's lora adapter on the alpaca dataset, using tools in `auto_gptq.utils.peft_utils` and `🤗 peft`.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python peft_lora_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
### AdaLora
The `peft_adalora_clm_instruction_tuning.py` script gives an example of instruction tuning a gptq quantized model's adalora adapter on the alpaca dataset, using tools in `auto_gptq.utils.peft_utils` and `🤗 peft`.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python peft_adalora_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
### AdaptionPrompt
The `peft_adaption_prompt_clm_instruction_tuning.py` script gives an example of instruction tuning a gptq quantized model's adaption_prompt adapter (llama-adapter) on the alpaca dataset, using tools in `auto_gptq.utils.peft_utils` and `🤗 peft`.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python peft_adaption_prompt_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
If you want to try models other than llama, you can install peft from source using [this branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt); see [here](https://github.com/PanQiWei/peft/blob/a5f8f74f07591efe5eb3d08cb1b31b981e84a069/src/peft/tuners/adaption_prompt.py#L235)
to check which other models are also supported. With this branch installed, you can also use the `ADAPTION_PROMPT_V2` peft type (llama-adapter-v2) by simply replacing `AdaptionPromptConfig` with `AdaptionPromptV2Config` in the script, as sketched below.
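A hypothetical sketch of that swap; the constructor arguments below are assumed to match `AdaptionPromptConfig` (as used in the adaption_prompt example above), so check the linked branch for the exact signature:

```python
from peft import TaskType
from peft import AdaptionPromptV2Config  # only available on the linked peft branch

# assumption: same core arguments as AdaptionPromptConfig
peft_config = AdaptionPromptV2Config(
    adapter_len=10,
    adapter_layers=30,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)
```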

File diff suppressed because it is too large

@ -144,7 +144,9 @@ def load_model_tokenizer(
trust_remote_code: bool = False,
use_triton: bool = False,
use_safetensors: bool = False,
use_fast_tokenizer: bool = False
use_fast_tokenizer: bool = False,
inject_fused_attention: bool = True,
inject_fused_mlp: bool = True
):
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=tokenizer_name_or_path or model_name_or_path,
@ -163,12 +165,12 @@ def load_model_tokenizer(
)
else:
model = AutoGPTQForCausalLM.from_quantized(
save_dir=model_name_or_path,
model_name_or_path,
max_memory=max_memory,
low_cpu_mem_usage=True,
use_triton=use_triton,
inject_fused_attention=True,
inject_fused_mlp=True,
inject_fused_attention=inject_fused_attention,
inject_fused_mlp=inject_fused_mlp,
use_cuda_fp16=True,
quantize_config=quantize_config,
model_basename=model_basename,
@ -232,6 +234,8 @@ def main():
parser.add_argument("--use_triton", action="store_true")
parser.add_argument("--use_safetensors", action="store_true")
parser.add_argument("--use_fast_tokenizer", action="store_true")
parser.add_argument("--no_inject_fused_attention", action="store_true")
parser.add_argument("--no_inject_fused_mlp", action="store_true")
parser.add_argument("--num_samples", type=int, default=10)
parser.add_argument("--per_gpu_max_memory", type=int, default=None)
parser.add_argument("--cpu_max_memory", type=int, default=None)
@ -269,7 +273,9 @@ def main():
trust_remote_code=args.trust_remote_code,
use_triton=args.use_triton,
use_safetensors=args.use_safetensors,
use_fast_tokenizer=args.use_fast_tokenizer
use_fast_tokenizer=args.use_fast_tokenizer,
inject_fused_attention=not args.no_inject_fused_attention,
inject_fused_mlp=not args.no_inject_fused_mlp
)
end = time.time()
logger.info(f"model and tokenizer loading time: {end - start:.4f}s")
@ -282,7 +288,9 @@ def main():
model.warmup_triton()
logger.info("loading data")
examples = load_data("dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples, args.max_new_tokens)
examples = load_data(
"../quantization/dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples, args.max_new_tokens
)
generation_config = GenerationConfig(
num_beams=args.num_beams,


@ -0,0 +1,86 @@
import os
import argparse
import torch
from auto_gptq.utils import Perplexity
from transformers import AutoTokenizer
if __name__ == "__main__":
"""
Example usage.
Default usage with GPT2 model:
python examples/benchmark/perplexity.py
Specify GPTQ quantized model:
python examples/benchmark/perplexity.py \
--model_name TheBloke/open-llama-7b-open-instruct-GPTQ \
--model_basename gptq_model-4bit-128g \
--is_quantized
Change your dataset:
python examples/benchmark/perplexity.py --dataset_path tiny_shakespeare
"""
parser = argparse.ArgumentParser(description="Calculate Perplexity for a model.")
parser.add_argument("--model_name", type=str, default='gpt2', help="Model name.")
parser.add_argument("--model_basename", type=str, default=None, help="Model file's basename.")
parser.add_argument("--n_ctx", type=int, default=512, help="Context size.")
parser.add_argument("--n_batch", type=int, default=512, help="Batch size.")
parser.add_argument("--dataset_path", type=str, default='wikitext', help="Path to the dataset.")
parser.add_argument("--dataset_name", type=str, default=None, help="Name of the dataset.")
parser.add_argument("--split", type=str, default='test', help="Dataset split to use.")
parser.add_argument("--text_column", type=str, default='text', help="Column in the dataset containing the text.")
parser.add_argument("--per_gpu_max_memory", type=int, default=None, help="Max memory used in each GPU.")
parser.add_argument("--cpu_max_memory", type=int, default=None, help="Mx memory used in CPU.")
parser.add_argument("--is_quantized", action="store_true", help="Is the model GPTQ quantized?")
parser.add_argument("--use_safetensors", action="store_true", help="Whether to use safetensors model file")
parser.add_argument("--use_fast_tokenizer", action="store_true", help="Wheter to use fast tokenizer")
parser.add_argument("--trust_remote_code", action="store_true", help="Whether to use remote code")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
max_memory = dict()
if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
if torch.cuda.is_available():
max_memory.update(
{i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())}
)
if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
if not max_memory:
max_memory = None
if args.is_quantized:
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
args.model_name,
low_cpu_mem_usage=True,
device_map="auto",
max_memory=max_memory,
model_basename=args.model_basename,
use_safetensors=args.use_safetensors,
trust_remote_code=args.trust_remote_code,
inject_fused_mlp=False,
inject_fused_attention=False
)
else:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
args.model_name,
low_cpu_mem_usage=True,
device_map="auto",
max_memory=max_memory,
torch_dtype=torch.float16,
trust_remote_code=args.trust_remote_code
)
ppl = Perplexity(model, tokenizer, args.dataset_path, args.dataset_name, args.split, args.text_column)
ppl.calculate_perplexity(args.n_ctx, args.n_batch)


@ -0,0 +1,169 @@
import json
import os
from argparse import ArgumentParser
from functools import partial
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from auto_gptq.utils.peft_utils import GPTQAdaLoraConfig
from peft import TaskType
parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--lr", type=float, default=3e-3)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path
lr = args.lr
num_epochs = args.num_epochs
# creating model
peft_config = GPTQAdaLoraConfig(
init_r=20,
target_r=16,
beta1=0.85,
beta2=0.85,
tinit=200,
tfinal=1000,
deltaT=10,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoGPTQForCausalLM.from_quantized(
model_name_or_path,
use_triton=True,
warmup_triton=False,
trainable=True,
inject_fused_attention=True,
inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"
def ds_refactor_fn(samples):
instruction_data = samples["instruction"]
input_data = samples["input"]
output_data = samples["output"]
new_samples = {"prompt": [], "output": []}
for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
if input_txt:
prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
else:
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
new_samples["prompt"].append(prompt)
new_samples["output"].append(output_txt)
return new_samples
ds = Dataset.from_generator(
lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
make_data_block,
batched=True,
batch_size=len(ds),
num_proc=1,
remove_columns=ds.column_names,
keep_in_memory=True,
load_from_cache_file=False,
fn_kwargs={
"prompt_col_name": "prompt",
"label_col_name": "output",
"tokenizer": tokenizer,
"preprocess_fn": ds_refactor_fn,
"sample_max_len": args.sample_max_length,
"block_max_len": args.block_max_length,
"add_eos_token": True,
"truncate_prompt": False,
"merge_prompt_label": True
}
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
model.base_model.peft_config["default"].total_step = len(train_dataloader) * num_epochs
# training and evaluation
with torch.cuda.amp.autocast():
global_step = 0
for epoch in range(num_epochs):
model.train()
total_loss = 0
progress_bar = tqdm(train_dataloader)
for step, batch in enumerate(progress_bar):
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
loss.backward()
optimizer.step()
lr_scheduler.step()
# Update the importance of low-rank matrices
# and allocate the budget accordingly.
model.base_model.update_and_allocate(global_step)
optimizer.zero_grad()
global_step += 1
progress_bar.set_postfix(loss=loss.item())
model.eval()
eval_loss = 0
eval_preds = []
for step, batch in enumerate(tqdm(eval_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
eval_loss += loss.detach().float()
eval_preds.extend(
tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
)
eval_epoch_loss = eval_loss / len(eval_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))


@ -0,0 +1,158 @@
import json
import os
from argparse import ArgumentParser
from functools import partial
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from peft import TaskType, AdaptionPromptConfig
parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--adapter_len", type=int, default=10)
parser.add_argument("--adapter_layers", type=int, default=30)
parser.add_argument("--lr", type=float, default=3e-3)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path
lr = args.lr
num_epochs = args.num_epochs
# creating model
peft_config = AdaptionPromptConfig(
adapter_len=args.adapter_len,
adapter_layers=args.adapter_layers,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoGPTQForCausalLM.from_quantized(
model_name_or_path,
use_triton=True,
warmup_triton=False,
trainable=True,
inject_fused_attention=False,
inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"
def ds_refactor_fn(samples):
instruction_data = samples["instruction"]
input_data = samples["input"]
output_data = samples["output"]
new_samples = {"prompt": [], "output": []}
for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
if input_txt:
prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
else:
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
new_samples["prompt"].append(prompt)
new_samples["output"].append(output_txt)
return new_samples
ds = Dataset.from_generator(
lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
make_data_block,
batched=True,
batch_size=len(ds),
num_proc=1,
remove_columns=ds.column_names,
keep_in_memory=True,
load_from_cache_file=False,
fn_kwargs={
"prompt_col_name": "prompt",
"label_col_name": "output",
"tokenizer": tokenizer,
"preprocess_fn": ds_refactor_fn,
"sample_max_len": args.sample_max_length,
"block_max_len": args.block_max_length,
"add_eos_token": True,
"truncate_prompt": False,
"merge_prompt_label": True
}
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
# training and evaluation
with torch.cuda.amp.autocast():
for epoch in range(num_epochs):
model.train()
total_loss = 0
progress_bar = tqdm(train_dataloader)
for step, batch in enumerate(progress_bar):
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.set_postfix(loss=loss.item())
model.eval()
eval_loss = 0
eval_preds = []
for step, batch in enumerate(tqdm(eval_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
eval_loss += loss.detach().float()
eval_preds.extend(
tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
)
eval_epoch_loss = eval_loss / len(eval_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))


@ -0,0 +1,158 @@
import json
import os
from argparse import ArgumentParser
from functools import partial
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from auto_gptq.utils.peft_utils import GPTQLoraConfig
from peft import TaskType
parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--lr", type=float, default=3e-5)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path
lr = args.lr
num_epochs = args.num_epochs
# creating model
peft_config = GPTQLoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoGPTQForCausalLM.from_quantized(
model_name_or_path,
use_triton=True,
warmup_triton=False,
trainable=True,
inject_fused_attention=True,
inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"
def ds_refactor_fn(samples):
instruction_data = samples["instruction"]
input_data = samples["input"]
output_data = samples["output"]
new_samples = {"prompt": [], "output": []}
for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
if input_txt:
prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
else:
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
new_samples["prompt"].append(prompt)
new_samples["output"].append(output_txt)
return new_samples
ds = Dataset.from_generator(
lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
make_data_block,
batched=True,
batch_size=len(ds),
num_proc=1,
remove_columns=ds.column_names,
keep_in_memory=True,
load_from_cache_file=False,
fn_kwargs={
"prompt_col_name": "prompt",
"label_col_name": "output",
"tokenizer": tokenizer,
"preprocess_fn": ds_refactor_fn,
"sample_max_len": args.sample_max_length,
"block_max_len": args.block_max_length,
"add_eos_token": True,
"truncate_prompt": False,
"merge_prompt_label": True
}
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
# training and evaluation
with torch.cuda.amp.autocast():
for epoch in range(num_epochs):
model.train()
total_loss = 0
progress_bar = tqdm(train_dataloader)
for step, batch in enumerate(progress_bar):
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.set_postfix(loss=loss.item())
model.eval()
eval_loss = 0
eval_preds = []
for step, batch in enumerate(tqdm(eval_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
eval_loss += loss.detach().float()
eval_preds.extend(
tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
)
eval_epoch_loss = eval_loss / len(eval_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))


@ -4,29 +4,30 @@ import sys
from pathlib import Path
from setuptools import setup, find_packages
try:
import torch
TORCH_AVAILABLE = True
except ImportError:
TORCH_AVAILABLE = False
IN_GITHUB_ACTIONS = os.environ.get("GITHUB_ACTIONS", "false") == "true"
python_min_version = (3, 8, 0)
python_min_version_str = '.'.join(map(str, python_min_version))
if sys.version_info < python_min_version:
print(f"You are using Python {platform.python_version()}. Python >={python_min_version_str} is required.")
sys.exit(-1)
CUDA_VERSION = "".join(os.environ.get("CUDA_VERSION", "").split("."))
BUILD_CUDA_EXT = int(os.environ.get('BUILD_CUDA_EXT', '1')) == 1
if BUILD_CUDA_EXT:
try:
import torch
except:
print("torch is not installed, please install torch first!")
sys.exit(-1)
CUDA_VERSION = "".join(torch.version.cuda.split("."))
else:
CUDA_VERSION = "".join(os.environ.get("CUDA_VERSION", "").split("."))
version = "0.2.1" + (f"+cu{CUDA_VERSION}" if CUDA_VERSION and IN_GITHUB_ACTIONS else "")
common_setup_kwargs = {
"version": version,
"version": "0.3.2",
"name": "auto_gptq",
"author": "PanQiWei",
"description": "An easy-to-use LLMs quantization package with user-friendly apis, based on GPTQ algorithm.",
"long_description": (Path(__file__).parent / "README.md").read_text(),
"long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"),
"long_description_content_type": "text/markdown",
"url": "https://github.com/PanQiWei/AutoGPTQ",
"keywords": ["gptq", "quantization", "large-language-models", "pytorch", "transformers"],
@ -45,6 +46,9 @@ common_setup_kwargs = {
"python_requires": f">={python_min_version_str}"
}
if CUDA_VERSION:
common_setup_kwargs['version'] += f"+cu{CUDA_VERSION}"
requirements = [
"accelerate>=0.19.0",
"datasets",
@ -52,33 +56,37 @@ requirements = [
"rouge",
"torch>=1.13.0",
"safetensors",
"transformers>=4.26.1"
"transformers>=4.31.0",
"peft"
]
extras_require = {
"llama": ["transformers>=4.28.0"],
"triton": ["triton>=2.0.0"]
}
include_dirs = ["autogptq_cuda"]
if TORCH_AVAILABLE:
BUILD_CUDA_EXT = int(os.environ.get('BUILD_CUDA_EXT', '1')) == 1
additional_setup_kwargs = dict()
if BUILD_CUDA_EXT and (torch.cuda.is_available() or IN_GITHUB_ACTIONS):
additional_setup_kwargs = dict()
if BUILD_CUDA_EXT:
from torch.utils import cpp_extension
from distutils.sysconfig import get_python_lib
conda_cuda_include_dir=os.path.join(get_python_lib(),"nvidia/cuda_runtime/include")
conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
if os.path.isdir(conda_cuda_include_dir):
include_dirs.append(conda_cuda_include_dir)
print(f"appending conda cuda include dir {conda_cuda_include_dir}")
extensions = [
cpp_extension.CUDAExtension(
"autogptq_cuda",
"autogptq_cuda_64",
[
"autogptq_cuda/autogptq_cuda.cpp",
"autogptq_cuda/autogptq_cuda_kernel.cu"
"autogptq_cuda/autogptq_cuda_64.cpp",
"autogptq_cuda/autogptq_cuda_kernel_64.cu"
]
),
cpp_extension.CUDAExtension(
"autogptq_cuda_256",
[
"autogptq_cuda/autogptq_cuda_256.cpp",
"autogptq_cuda/autogptq_cuda_kernel_256.cu"
]
)
]
@ -87,19 +95,11 @@ if TORCH_AVAILABLE:
"ext_modules": extensions,
"cmdclass": {'build_ext': cpp_extension.BuildExtension}
}
common_setup_kwargs.update(additional_setup_kwargs)
setup(
common_setup_kwargs.update(additional_setup_kwargs)
setup(
packages=find_packages(),
install_requires=requirements,
extras_require=extras_require,
include_dirs=include_dirs,
**common_setup_kwargs
)
else:
setup(
packages=find_packages(),
install_requires=requirements,
extras_require=extras_require,
include_dirs=include_dirs,
**common_setup_kwargs
)
)