Merge branch 'main' into MPT

# Conflicts:
#   auto_gptq/modeling/__init__.py
#   auto_gptq/modeling/_const.py
#   auto_gptq/modeling/auto.py

Commit 6ff6bc8dfc: 37 changed files with 3503 additions and 258905 deletions
@@ -1,4 +1,4 @@
-name: Build AutoGPTQ Wheels
+name: Build AutoGPTQ Wheels with CUDA
 
 on: workflow_dispatch
 
@@ -51,7 +51,7 @@ jobs:
         if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH}
         $env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6+PTX'
         if ([decimal]$env:CUDA_VERSION -ge 11.8) { $env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
-        python -m build -n
+        python setup.py sdist bdist_wheel
 
     - uses: actions/upload-artifact@v3
       if: runner.os == 'Linux'
@@ -64,37 +64,3 @@ jobs:
       with:
         name: 'windows-wheels'
         path: ./dist/*.whl
-
-  build_sdist:
-    name: Build source distribution
-    runs-on: ubuntu-latest
-    defaults:
-      run:
-        shell: pwsh
-
-    steps:
-    - uses: actions/checkout@v3
-      with:
-        ref: 'main'
-
-    - uses: actions/setup-python@v3
-      with:
-        python-version: "3.10"
-
-    - name: Install Dependencies
-      run: |
-        python -m pip install --upgrade build setuptools wheel
-
-    - name: Build Wheel
-      run: |
-        python -m build -n
-
-    - uses: actions/upload-artifact@v3
-      with:
-        name: 'sdist'
-        path: ./dist/*.tar.gz
-
-    - uses: actions/upload-artifact@v3
-      with:
-        name: 'no-cuda-wheel'
-        path: ./dist/*.whl
README.md: 41 changes

@@ -12,14 +12,15 @@
     <p>
         <b>English</b> |
         <a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README_zh.md">中文</a>
-    <p>
+    </p>
 </h4>
 
+*<center>📣 Long time no see! 👋 Architecture upgrade, performance optimization and more new features will come in July and August, stay tune! 🥂</center>*
 
 ## News or Update
 
-**To experience adapter training using `auto_gptq` quantized model in advance, you can try [this branch](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration) and discuss [in here](https://github.com/PanQiWei/AutoGPTQ/issues/103), examples are [in here](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration/examples/peft).**
+- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
+- 2023-06-05 - (Update) - Integrate with 🤗 peft to use gptq quantized model to train adapters, support LoRA, AdaLoRA, AdaptionPrompt, etc.
-- 2023-05-25 - (In Progress) - Integrate with 🤗 peft to use gptq quantized model to train adapters, support LoRA, AdaLoRA, AdaptionPrompt, etc.
 - 2023-05-30 - (Update) - Support download/upload quantized model from/to 🤗 Hub.
 - 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefineWeb/RefineWebModel`(falcon) model types.
 - 2023-05-04 - (Update) - Support using faster cuda kernel when `not desc_act or group_size == -1`.
@@ -69,11 +70,7 @@ And to make sure `autogptq_cuda` is not ever in your virtual environment, run:
 ```shell
 pip uninstall autogptq_cuda -y
 ```
-#### to support LLaMa model
-For some people want to try LLaMa and whose `transformers` version not meet the newest one that supports it, using:
-```shell
-pip install auto-gptq[llama]
-```
 #### to support triton speedup
 To integrate with `triton`, using:
 > warning: currently triton only supports linux; 3-bit quantization is not supported when using triton
@@ -96,8 +93,6 @@ pip install .
 ```
 Like quick installation, you can also set `BUILD_CUDA_EXT=0` to disable pytorch extension building.
 
-Use `.[llama]` if you want to try LLaMa model.
-
 Use `.[triton]` if you want to integrate with triton and it's available on your operating system.
 
 </details>
@@ -304,18 +299,18 @@ print(
 >
 > for example, model_type of `WizardLM`, `vicuna` and `gpt4all` are all `llama`, hence they are all supported by `auto_gptq`.
 
-| model type | quantization | inference | peft-lora | peft-adaption_prompt |
-|------------|--------------|-----------|-----------|----------------------|
+| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
+|------------|--------------|-----------|-----------|---------------|----------------------|
-| bloom | ✅ | ✅ | | |
+| bloom | ✅ | ✅ | ✅ | ✅ | |
-| gpt2 | ✅ | ✅ | | |
+| gpt2 | ✅ | ✅ | ✅ | ✅ | |
-| gpt_neox | ✅ | ✅ | | |
+| gpt_neox | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| gptj | ✅ | ✅ | | |
+| gptj | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| llama | ✅ | ✅ | | ✅ |
+| llama | ✅ | ✅ | ✅ | ✅ | ✅ |
-| moss | ✅ | ✅ | | |
+| moss | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| opt | ✅ | ✅ | | |
+| opt | ✅ | ✅ | ✅ | ✅ | |
-| gpt_bigcode | ✅ | ✅ | | |
+| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
-| codegen | ✅ | ✅ | | |
+| codegen | ✅ | ✅ | ✅ | ✅ | |
-| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | | |
+| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
 
 ## Supported Evaluation Tasks
 Currently, `auto_gptq` supports: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more Tasks will come soon!
README_zh.md: 41 changes

@@ -12,14 +12,15 @@
     <p>
         <a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README.md">English</a> |
         <b>中文</b>
-    <p>
+    </p>
 </h4>
 
+*<center>📣 好久不见!👋 七月和八月将会迎来架构升级,性能优化和新特性,敬请关注!🥂</center>*
 
 ## 新闻或更新
 
-**提前体验使用 `auto_gptq` 量化过的模型来训练适应层,你可以尝试[这个分支](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration) 并在[这里](https://github.com/PanQiWei/AutoGPTQ/issues/103)进行讨论,你也可以参考[这里](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration/examples/peft)所提供的示例脚本。**
+- 2023-07-26 - (更新) - 一个优雅的 [PPL 测评脚本](examples/benchmark/perplexity.py)以获得可以与诸如 `llama.cpp` 等代码库进行公平比较的结果。
+- 2023-06-05 - (更新) - 集成 🤗 peft 来使用 gptq 量化过的模型训练适应层,支持 LoRA,AdaLoRA,AdaptionPrompt 等。
-- 2023-05-25 - (开发中) - 集成 🤗 peft 来使用 gptq 量化过的模型训练适应层,支持 LoRA,AdaLoRA,AdaptionPrompt 等。
 - 2023-05-30 - (更新) - 支持从 🤗 Hub 下载量化好的模型或上次量化好的模型到 🤗 Hub。
 - 2023-05-27 - (更新) - 支持以下模型的量化和推理: `gpt_bigcode`, `codegen` 以及 `RefineWeb/RefineWebModel`(falcon)。
 - 2023-05-04 - (更新) - 支持在 `not desc_act or group_size == -1` 的情况下使用更快的 cuda 算子。
@@ -69,11 +70,7 @@ BUILD_CUDA_EXT=0 pip install auto-gptq
 ```shell
 pip uninstall autogptq_cuda -y
 ```
-#### 支持使用 LLaMa 模型
-若想要尝试 LLaMa 模型,但 `transformers` 版本不为支持该模型的最新版本,使用以下命令:
-```shell
-pip install auto-gptq[llama]
-```
 #### 支持使用 triton 加速
 若想使用 `triton` 加速模型推理,使用以下命令:
 > 警告:目前 triton 仅支持 linux 操作系统;当使用 triton 时 3-bit 数值类型的量化将不被支持
@@ -96,8 +93,6 @@ pip install .
 ```
 正如在快速安装一节,你可以使用 `BUILD_CUDA_EXT=0` 来取消构建 cuda 拓展。
 
-如果你想要使用 LLaMa 模型,请使用 `.[llama]`。
-
 如果你想要使用 triton 加速且其能够被你的操作系统所支持,请使用 `.[triton]`。
 
 </details>
@@ -303,18 +298,18 @@ print(
 >
 > 比如, `WizardLM`,`vicuna` 和 `gpt4all` 模型的 `model_type` 皆为 `llama`, 因此这些模型皆被 `auto_gptq` 所支持。
 
-| model type | quantization | inference | peft-lora | peft-adaption_prompt |
-|------------|--------------|-----------|-----------|----------------------|
+| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
+|------------|--------------|-----------|-----------|---------------|----------------------|
-| bloom | ✅ | ✅ | | |
+| bloom | ✅ | ✅ | ✅ | ✅ | |
-| gpt2 | ✅ | ✅ | | |
+| gpt2 | ✅ | ✅ | ✅ | ✅ | |
-| gpt_neox | ✅ | ✅ | | |
+| gpt_neox | ✅ | ✅ | ✅ | ✅ | ✅[要求该分支的 peft](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| gptj | ✅ | ✅ | | |
+| gptj | ✅ | ✅ | ✅ | ✅ | ✅[要求该分支的 peft](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| llama | ✅ | ✅ | | ✅ |
+| llama | ✅ | ✅ | ✅ | ✅ | ✅ |
-| moss | ✅ | ✅ | | |
+| moss | ✅ | ✅ | ✅ | ✅ | ✅[要求该分支的 peft](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
-| opt | ✅ | ✅ | | |
+| opt | ✅ | ✅ | ✅ | ✅ | |
-| gpt_bigcode | ✅ | ✅ | | |
+| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
-| codegen | ✅ | ✅ | | |
+| codegen | ✅ | ✅ | ✅ | ✅ | |
-| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | | |
+| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
 
 ## 支持的评估任务
 目前, `auto_gptq` 支持以下评估任务: `LanguageModelingTask`, `SequenceClassificationTask` 和 `TextSummarizationTask`;更多的评估任务即将到来!
@@ -1,2 +1,4 @@
+__version__ = "0.3.2"
 from .modeling import BaseQuantizeConfig
 from .modeling import AutoGPTQForCausalLM
+from .utils.peft_utils import get_gptq_peft_model
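Taken together with the README changes above, the new top-level exports outline the basic quantize-and-save flow. The snippet below is a minimal illustration only; the model id, calibration text and output directory are placeholders, not part of this commit.

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

pretrained_model_id = "facebook/opt-125m"   # placeholder model id
quantized_model_dir = "opt-125m-4bit"       # placeholder output directory

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=True)
examples = [tokenizer("auto-gptq is an easy-to-use model quantization library.")]

quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

# Load the fp16 model, run GPTQ calibration on the examples, then save the result.
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_id, quantize_config)
model.quantize(examples)
model.save_quantized(quantized_model_dir, use_safetensors=True)
```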
@@ -10,4 +10,6 @@ from .opt import *
 from .rw import *
 from .gpt_bigcode import *
 from .codegen import *
+from .baichuan import *
+from .internlm import *
 from .mpt import *
@@ -20,10 +20,11 @@ from transformers.modeling_utils import no_init_weights
 from ._const import *
 from ._utils import *
+from ..nn_modules.qlinear import GeneralQuantLinear
 from ..nn_modules._fused_base import FusedBaseAttentionModule, FusedBaseMLPModule
 from ..quantization import GPTQ
 from ..utils.data_utils import collate_data
-from ..utils.import_utils import TRITON_AVAILABLE
+from ..utils.import_utils import dynamically_import_QuantLinear, TRITON_AVAILABLE, AUTOGPTQ_CUDA_AVAILABLE
 
 logger = getLogger(__name__)
 
@@ -112,7 +113,16 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
     fused_attn_module_type: Optional[FusedBaseAttentionModule] = None
     fused_mlp_module_type: Optional[FusedBaseMLPModule] = None
 
-    def __init__(self, model: PreTrainedModel, quantized: bool, quantize_config: BaseQuantizeConfig):
+    def __init__(
+        self,
+        model: PreTrainedModel,
+        quantized: bool,
+        quantize_config: BaseQuantizeConfig,
+        is_triton_backend: bool = False,
+        injected_fused_attention: bool = False,
+        injected_fused_mlp: bool = False,
+        trainable: bool = False
+    ):
         super().__init__()
 
         self.model = model
@@ -121,6 +131,11 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         self.quantize_config = quantize_config
         self.config = self.model.config
 
+        self.is_triton_backend = is_triton_backend
+        self.injected_fused_attention = injected_fused_attention
+        self.injected_fused_mlp = injected_fused_mlp
+        self.trainable = trainable
+
     @property
     def quantized(self):
         return self._quantized
@@ -431,6 +446,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         repo_id: str,
         save_dir: Optional[str] = None,
         use_safetensors: Optional[bool] = True,
+        safetensors_metadata: Optional[Dict[str, str]] = None,
         commit_message: Optional[str] = "Upload of AutoGPTQ quantized model",
         use_auth_token: Optional[Union[bool, str]] = None,
         private: Optional[bool] = None,
@@ -450,6 +466,10 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
             use_safetensors (`bool`, *optional*):
                 Save the model using `safetensors`.
                 If the model has already been saved, this parameter can be omitted.
+            safetensors_metadata: (`dict`, *optional*, defaults to `None`):
+                Pass optional metadata dictionary to be saved in the `safetensors` model file(s).
+                Metadata is optional and is purely for informational purposes. It does not affect inference.
+                If `None`, no metadata will be saved.
             commit_message (`str`, *optional*, defaults to `"Upload tool"`):
                 Message to commit while pushing.
             use_auth_token (`bool` or `str`, *optional*):
@@ -469,7 +489,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
 
         if save_dir is not None:
             logger.info(f"Saving model to {save_dir}")
-            self.save_quantized(save_dir, use_safetensors)
+            self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
 
         repo_url = create_repo(
             repo_id=repo_id, token=token, private=private, exist_ok=True, repo_type="model"
@@ -492,7 +512,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
                 repo_type="model",
             )
 
-    def save_quantized(self, save_dir: str, use_safetensors: bool = False):
+    def save_quantized(self, save_dir: str, use_safetensors: bool = False, safetensors_metadata: Optional[Dict[str, str]] = None):
         """save quantized model and configs to local disk"""
         os.makedirs(save_dir, exist_ok=True)
@@ -506,7 +526,42 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
             model_save_name = model_base_name + ".safetensors"
             state_dict = self.model.state_dict()
             state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
-            safe_save(state_dict, join(save_dir, model_save_name))
+            if safetensors_metadata is None:
+                safetensors_metadata = {}
+            elif not isinstance(safetensors_metadata, dict):
+                raise TypeError("safetensors_metadata must be a dictionary.")
+            else:
+                logger.debug(f"Received safetensors_metadata: {safetensors_metadata}")
+                new_safetensors_metadata = {}
+                converted_keys = False
+                for key, value in safetensors_metadata.items():
+                    if not isinstance(key, str) or not isinstance(value, str):
+                        converted_keys = True
+                        try:
+                            new_key = str(key)
+                            new_value = str(value)
+                        except Exception as e:
+                            raise TypeError(f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}")
+                        if new_key in new_safetensors_metadata:
+                            logger.warning(f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.")
+                        new_safetensors_metadata[new_key] = new_value
+                safetensors_metadata = new_safetensors_metadata
+                if converted_keys:
+                    logger.debug(f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}")
+
+            # Format is required to enable Accelerate to load the metadata
+            # otherwise it raises an OSError
+            safetensors_metadata['format'] = "pt"
+
+            # Store the quantization configuration as safetensors metadata
+            from auto_gptq import __version__
+            safetensors_metadata['auto_gptq_version'] = str(__version__)
+            safetensors_metadata['gptq_bits'] = str(self.quantize_config.bits)
+            safetensors_metadata['gptq_group_size'] = str(self.quantize_config.group_size)
+            safetensors_metadata['gptq_desc_act'] = str(self.quantize_config.desc_act)
+            safetensors_metadata['gptq_damp_percent'] = str(self.quantize_config.damp_percent)
+
+            safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata)
         else:
             model_save_name = model_base_name + ".bin"
             torch.save(self.model.state_dict(), join(save_dir, model_save_name))
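As a usage sketch for the hunk above: the metadata argument is a plain string-to-string dict that gets merged with the automatically written `gptq_*` and `format` entries. The directory and metadata values here are illustrative only, and `model` is assumed to be an already-quantized instance.

```python
# `model` is assumed to be an already-quantized BaseGPTQForCausalLM instance.
model.save_quantized(
    "opt-125m-4bit",                      # placeholder output directory
    use_safetensors=True,
    safetensors_metadata={
        "quantized_by": "example-user",   # arbitrary example keys and values
        "calibration_dataset": "c4",
    },
)
```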
@@ -516,10 +571,10 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         self.quantize_config.model_name_or_path = save_dir
         self.quantize_config.model_file_base_name = model_base_name
 
-    def save_pretrained(self, save_dir: str, use_safetensors: bool = False, **kwargs):
+    def save_pretrained(self, save_dir: str, use_safetensors: bool = False, safetensors_metadata: Optional[Dict[str, str]] = None, **kwargs):
         """alias of save_quantized"""
         logger.warning("you are using save_pretrained, which will re-direct to save_quantized.")
-        self.save_quantized(save_dir, use_safetensors)
+        self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
 
     @classmethod
     def from_pretrained(
@@ -543,7 +598,29 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         torch.nn.init.uniform_ = skip
         torch.nn.init.normal_ = skip
 
-        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
+        # Parameters related to loading from Hugging Face Hub
+        cache_dir = model_init_kwargs.pop("cache_dir", None)
+        force_download = model_init_kwargs.pop("force_download", False)
+        resume_download = model_init_kwargs.pop("resume_download", False)
+        proxies = model_init_kwargs.pop("proxies", None)
+        local_files_only = model_init_kwargs.pop("local_files_only", False)
+        use_auth_token = model_init_kwargs.pop("use_auth_token", None)
+        revision = model_init_kwargs.pop("revision", None)
+        subfolder = model_init_kwargs.pop("subfolder", "")
+        commit_hash = model_init_kwargs.pop("_commit_hash", None)
+
+        cached_file_kwargs = {
+            "cache_dir": cache_dir,
+            "force_download": force_download,
+            "proxies": proxies,
+            "resume_download": resume_download,
+            "local_files_only": local_files_only,
+            "use_auth_token": use_auth_token,
+            "revision": revision,
+            "subfolder": subfolder,
+        }
+
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True, **cached_file_kwargs)
         if config.model_type not in SUPPORTED_MODELS:
             raise TypeError(f"{config.model_type} isn't supported yet.")
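Because `from_pretrained` now strips Hub-related options out of `model_init_kwargs` and reuses them for both the config and the weights download, a call like the following sketch should be possible; the model id and cache directory are placeholders.

```python
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

quantize_config = BaseQuantizeConfig(bits=4, group_size=128)

# cache_dir/revision/local_files_only are popped into cached_file_kwargs and
# forwarded to AutoConfig.from_pretrained and AutoModelForCausalLM.from_pretrained.
model = AutoGPTQForCausalLM.from_pretrained(
    "facebook/opt-125m",      # placeholder model id
    quantize_config,
    cache_dir="./hf_cache",
    revision="main",
    local_files_only=False,
)
```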
@@ -579,7 +656,9 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
 
         torch.cuda.empty_cache()
 
-        model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs)
+        merged_kwargs = {**model_init_kwargs, **cached_file_kwargs}
+        model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **merged_kwargs)
 
         model_config = model.config.to_dict()
         seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
         if any([k in model_config for k in seq_len_keys]):
@@ -597,8 +676,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
     @classmethod
     def from_quantized(
         cls,
-        model_name_or_path: Optional[str] = None,
-        save_dir: Optional[str] = None,
+        model_name_or_path: Optional[str],
         device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None,
         max_memory: Optional[dict] = None,
         device: Optional[Union[str, int]] = None,
@@ -613,6 +691,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         use_safetensors: bool = False,
         trust_remote_code: bool = False,
         warmup_triton: bool = False,
+        trainable: bool = False,
         **kwargs
     ):
         """load quantized model from local disk"""
@@ -628,20 +707,25 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         subfolder = kwargs.pop("subfolder", "")
         commit_hash = kwargs.pop("_commit_hash", None)
 
+        cached_file_kwargs = {
+            "cache_dir": cache_dir,
+            "force_download": force_download,
+            "proxies": proxies,
+            "resume_download": resume_download,
+            "local_files_only": local_files_only,
+            "use_auth_token": use_auth_token,
+            "revision": revision,
+            "subfolder": subfolder,
+            "_raise_exceptions_for_missing_entries": False,
+            "_commit_hash": commit_hash,
+        }
+
         if use_triton and not TRITON_AVAILABLE:
             logger.warning("triton is not installed, reset use_triton to False")
             use_triton = False
 
         # == step1: prepare configs and file names == #
-        if model_name_or_path and save_dir:
-            logger.warning("save_dir will be ignored because model_name_or_path is explicit specified.")
-        if not model_name_or_path and save_dir:
-            model_name_or_path = save_dir
-            warnings.warn("save_dir is deprecated and will be removed in version 0.3.0", PendingDeprecationWarning, stacklevel=2)
-        if not model_name_or_path and not save_dir:
-            raise ValueError("at least one of model_name_or_path or save_dir should be specified.")
-
-        config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
-
+        config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code, **cached_file_kwargs)
         if config.model_type not in SUPPORTED_MODELS:
             raise TypeError(f"{config.model_type} isn't supported yet.")
@@ -670,25 +754,11 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         resolved_archive_file = None
         if is_local:
             model_save_name = join(model_name_or_path, model_basename)
 
             for ext in extensions:
                 if isfile(model_save_name + ext):
                     resolved_archive_file = model_save_name + ext
                     break
         else:  # remote
-            cached_file_kwargs = {
-                "cache_dir": cache_dir,
-                "force_download": force_download,
-                "proxies": proxies,
-                "resume_download": resume_download,
-                "local_files_only": local_files_only,
-                "use_auth_token": use_auth_token,
-                "revision": revision,
-                "subfolder": subfolder,
-                "_raise_exceptions_for_missing_entries": False,
-                "_commit_hash": commit_hash,
-            }
-
             for ext in extensions:
                 resolved_archive_file = cached_file(model_name_or_path, model_basename + ext, **cached_file_kwargs)
                 if resolved_archive_file is not None:
@@ -699,6 +769,9 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
 
         model_save_name = resolved_archive_file
 
+        if not use_triton and trainable:
+            logger.warning("QuantLinear with cuda backend not support trainable mode yet, Switch to the pytorch backend.")
+
         # == step2: convert model to gptq-model (replace Linear with QuantLinear) == #
         def skip(*args, **kwargs):
             pass
@@ -734,7 +807,8 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
             quantize_config.group_size,
             use_triton=use_triton,
             use_cuda_fp16=use_cuda_fp16,
-            desc_act=quantize_config.desc_act
+            desc_act=quantize_config.desc_act,
+            trainable=trainable
         )
         model.tie_weights()
 
@@ -794,6 +868,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         # == step5: (optional) inject optimized module == #
         if inject_fused_attention:
             if cls.fused_attn_module_type is None:
+                inject_fused_attention = False
                 logger.warning(f"{cls.__name__} hasn't fused attention module yet, will skip inject fused attention.")
             else:
                 cls.fused_attn_module_type.inject_to_model(
@@ -801,10 +876,12 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
                     use_triton=use_triton,
                     group_size=quantize_config.group_size,
                     use_cuda_fp16=use_cuda_fp16,
-                    desc_act=quantize_config.desc_act
+                    desc_act=quantize_config.desc_act,
+                    trainable=trainable
                 )
         if inject_fused_mlp:
             if cls.fused_mlp_module_type is None:
+                inject_fused_mlp = False
                 logger.warning(f"{cls.__name__} hasn't fused mlp module yet, will skip inject fused mlp.")
             else:
                 cls.fused_mlp_module_type.inject_to_model(
@@ -815,13 +892,26 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
         model.eval()
         # == step6: (optional) warmup triton == #
         if use_triton and warmup_triton:
-            from ..nn_modules.qlinear_triton import QuantLinear
+            from ..nn_modules.qlinear.qlinear_triton import QuantLinear
             QuantLinear.warmup(model, seqlen=model.seqlen)
 
             if inject_fused_mlp and cls.fused_mlp_module_type is not None:
                 cls.fused_mlp_module_type.warmup(model, seqlen=model.seqlen)
 
-        return cls(model, True, quantize_config)
+        # == step7: make model compatible with peft
+        cls.make_sure_compatible_with_peft(
+            model, use_triton, quantize_config.desc_act, quantize_config.group_size
+        )
+
+        return cls(
+            model,
+            True,
+            quantize_config,
+            is_triton_backend=use_triton,
+            injected_fused_attention=inject_fused_attention,
+            injected_fused_mlp=inject_fused_mlp and use_triton,
+            trainable=trainable
+        )
 
     def warmup_triton(self, enabled: bool = True):
         if not enabled:
@@ -830,11 +920,34 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
             logger.warning(f"triton is not available, skip warmup stage directly.")
             return
 
-        from ..nn_modules.qlinear_triton import QuantLinear
+        from ..nn_modules.qlinear.qlinear_triton import QuantLinear
         QuantLinear.warmup(self.model, seqlen=self.model.seqlen)
 
         if self.fused_mlp_module_type is not None:
             self.fused_mlp_module_type.warmup(self.model, seqlen=self.model.seqlen)
 
+    def enable_trainable_mode(self, enabled: bool = True):
+        if not self.is_triton_backend and enabled:
+            raise NotImplementedError("For now, trainable mode only supports triton backend.")
+        for n, m in self.model.named_modules():
+            if hasattr(m, "trainable"):
+                setattr(m, "trainable", enabled)
+
+    def disable_trainable_mode(self):
+        self.enable_trainable_mode(enabled=False)
+
+    @staticmethod
+    def make_sure_compatible_with_peft(model: PreTrainedModel, use_triton: bool, desc_act: bool, group_size: int):
+        GeneralQuantLinear.inject_to_model(
+            model,
+            dynamically_import_QuantLinear(use_triton, desc_act, group_size)
+        )
+
+    def __getattr__(self, item):
+        try:
+            return super().__getattr__(item)
+        except:
+            return getattr(self.model, item)
+
 
 __all__ = ["BaseGPTQForCausalLM", "BaseQuantizeConfig"]
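To show how the new trainable plumbing, `enable_trainable_mode` and the peft hook are meant to fit together, here is a hedged sketch. It assumes a quantized model already exists at a placeholder path, and it assumes `get_gptq_peft_model` accepts a peft config plus a `train_mode` flag as in the repository's peft examples; exact argument names may differ.

```python
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from peft import LoraConfig, TaskType

# Trainable mode is only implemented for the triton backend, so load with use_triton=True.
model = AutoGPTQForCausalLM.from_quantized(
    "opt-125m-4bit",          # placeholder path to a quantized model
    device="cuda:0",
    use_triton=True,
    trainable=True,
)

# Flip the `trainable` flag on every injected QuantLinear module.
model.enable_trainable_mode()

# Wrap the quantized model with a LoRA adapter for fine-tuning; whether a plain
# peft.LoraConfig is accepted here is an assumption, see the note above.
lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=16, lora_dropout=0.05)
peft_model = get_gptq_peft_model(model, peft_config=lora_config, train_mode=True)
peft_model.print_trainable_parameters()
```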
@@ -1,12 +1,27 @@
 from packaging.version import parse as parse_version
 
 from torch import device
-from transformers import __version__ as transformers_version
+
+from ..utils.import_utils import compare_transformers_version
 
 CPU = device("cpu")
 CUDA_0 = device("cuda:0")
 
-SUPPORTED_MODELS = ["bloom", "gptj", "gpt2", "gpt_neox", "opt", "moss", "gpt_bigcode", "codegen", "RefinedWebModel", "RefinedWeb", "mpt"]
+SUPPORTED_MODELS = [
+    "bloom",
+    "gptj",
+    "gpt2",
+    "gpt_neox",
+    "opt",
+    "moss",
+    "gpt_bigcode",
+    "codegen",
+    "RefinedWebModel",
+    "RefinedWeb",
+    "baichuan",
+    "internlm",
+    "mpt",
+]
 if compare_transformers_version("v4.28.0", op="ge"):
     SUPPORTED_MODELS.append("llama")
@@ -50,7 +50,17 @@ def get_module_by_name_suffix(model, module_name: str):
             return module
 
 
-def make_quant(module, names, bits, group_size, name='', use_triton=False, use_cuda_fp16=True, desc_act=False):
+def make_quant(
+    module,
+    names,
+    bits,
+    group_size,
+    name='',
+    use_triton=False,
+    use_cuda_fp16=True,
+    desc_act=False,
+    trainable=False
+):
     QuantLinear = dynamically_import_QuantLinear(use_triton=use_triton, desc_act=desc_act, group_size=group_size)
 
     if isinstance(module, QuantLinear):
@@ -71,13 +81,25 @@ def make_quant(module, names, bits, group_size, name='', use_triton=False, use_c
             in_features = tmp.weight.shape[0]
             out_features = tmp.weight.shape[1]
             if (not(desc_act) or group_size == -1) and not use_triton:
-                new_layer = QuantLinear(bits, group_size, in_features, out_features, True, use_cuda_fp16=use_cuda_fp16)
+                new_layer = QuantLinear(
+                    bits, group_size, in_features, out_features, True, use_cuda_fp16=use_cuda_fp16, trainable=trainable
+                )
             else:
-                new_layer = QuantLinear(bits, group_size, in_features, out_features, True)
+                new_layer = QuantLinear(bits, group_size, in_features, out_features, True, trainable=trainable)
             new_layer.device = ori_layer_device
             setattr(module, attr, new_layer.to(ori_layer_device))
     for name1, child in module.named_children():
-        make_quant(child, names, bits, group_size, name + '.' + name1 if name != '' else name1, use_triton=use_triton, use_cuda_fp16=use_cuda_fp16,desc_act=desc_act)
+        make_quant(
+            child,
+            names,
+            bits,
+            group_size,
+            name + '.' + name1 if name != '' else name1,
+            use_triton=use_triton,
+            use_cuda_fp16=use_cuda_fp16,
+            desc_act=desc_act,
+            trainable=trainable
+        )
 
 
 def pack_model(
@@ -1,4 +1,5 @@
-from typing import Optional
+from inspect import signature
+from typing import Dict, Optional, Union
 
 from ._base import BaseQuantizeConfig, BaseGPTQForCausalLM
 from ._utils import check_and_get_model_type
@@ -12,6 +13,8 @@ from .moss import MOSSGPTQForCausalLM
 from .opt import OPTGPTQForCausalLM
 from .rw import RWGPTQForCausalLM
 from .gpt_bigcode import GPTBigCodeGPTQForCausalLM
+from .baichuan import BaiChuanGPTQForCausalLM
+from .internlm import InternLMGPTQForCausalLM
 from .mpt import MPTGPTQForCausalLM
 
 
@@ -26,8 +29,10 @@ GPTQ_CAUSAL_LM_MODEL_MAP = {
     "gpt_bigcode": GPTBigCodeGPTQForCausalLM,
     "codegen": CodeGenGPTQForCausalLM,
     "RefinedWebModel": RWGPTQForCausalLM,
-    "RefinedWeb":RWGPTQForCausalLM,
-    "mpt": MPTGPTQForCausalLM
+    "RefinedWeb": RWGPTQForCausalLM,
+    "baichuan": BaiChuanGPTQForCausalLM,
+    "internlm": InternLMGPTQForCausalLM,
+    "mpt": MPTGPTQForCausalLM,
 }
@@ -48,7 +53,9 @@ class AutoGPTQForCausalLM:
         trust_remote_code: bool = False,
         **model_init_kwargs
     ) -> BaseGPTQForCausalLM:
-        model_type = check_and_get_model_type(pretrained_model_name_or_path, trust_remote_code)
+        model_type = check_and_get_model_type(
+            pretrained_model_name_or_path, trust_remote_code
+        )
         return GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_pretrained(
             pretrained_model_name_or_path=pretrained_model_name_or_path,
             quantize_config=quantize_config,
@@ -60,8 +67,7 @@ class AutoGPTQForCausalLM:
     @classmethod
     def from_quantized(
         cls,
-        model_name_or_path: Optional[str] = None,
-        save_dir: Optional[str] = None,
+        model_name_or_path: Optional[str],
         device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None,
         max_memory: Optional[dict] = None,
         device: Optional[Union[str, int]] = None,
@@ -75,14 +81,32 @@ class AutoGPTQForCausalLM:
         use_safetensors: bool = False,
         trust_remote_code: bool = False,
         warmup_triton: bool = False,
+        trainable: bool = False,
         **kwargs
     ) -> BaseGPTQForCausalLM:
-        model_type = check_and_get_model_type(save_dir or model_name_or_path, trust_remote_code)
+        model_type = check_and_get_model_type(model_name_or_path, trust_remote_code)
         quant_func = GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized
-        keywords = {key: kwargs[key] for key in signature(quant_func).parameters if key in kwargs}
+        # A static list of kwargs needed for huggingface_hub
+        huggingface_kwargs = [
+            "cache_dir",
+            "force_download",
+            "proxies",
+            "resume_download",
+            "local_files_only",
+            "use_auth_token",
+            "revision",
+            "subfolder",
+            "_raise_exceptions_for_missing_entries",
+            "_commit_hash"
+        ]
+        # TODO: do we need this filtering of kwargs? @PanQiWei is there a reason we can't just pass all kwargs?
+        keywords = {
+            key: kwargs[key]
+            for key in list(signature(quant_func).parameters.keys()) + huggingface_kwargs
+            if key in kwargs
+        }
         return quant_func(
             model_name_or_path=model_name_or_path,
-            save_dir=save_dir,
             device_map=device_map,
             max_memory=max_memory,
             device=device,
@@ -96,6 +120,7 @@ class AutoGPTQForCausalLM:
             use_safetensors=use_safetensors,
             trust_remote_code=trust_remote_code,
             warmup_triton=warmup_triton,
+            trainable=trainable,
             **keywords
         )
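A loading-side sketch of the reworked `AutoGPTQForCausalLM.from_quantized`: the `save_dir` argument is gone, `trainable` is forwarded, and Hub download options survive the kwargs filtering thanks to the `huggingface_kwargs` whitelist. The repository id below is a placeholder.

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

repo_id = "some-user/opt-125m-4bit"   # placeholder Hub repo or local directory

tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(
    repo_id,
    device="cuda:0",
    use_safetensors=True,
    revision="main",            # kept by the huggingface_kwargs whitelist
    cache_dir="./hf_cache",
)

inputs = tokenizer("auto_gptq is", return_tensors="pt").to("cuda:0")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```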
auto_gptq/modeling/baichuan.py (new file, 16 lines)

@@ -0,0 +1,16 @@
+from ._base import *
+
+
+class BaiChuanGPTQForCausalLM(BaseGPTQForCausalLM):
+    layer_type = "DecoderLayer"
+    layers_block_name = "model.layers"
+    outside_layer_modules = ["model.embed_tokens", "model.norm"]
+    inside_layer_modules = [
+        ["self_attn.W_pack"],
+        ["self_attn.o_proj"],
+        ["mlp.up_proj", "mlp.gate_proj"],
+        ["mlp.down_proj"]
+    ]
+
+
+__all__ = ["BaiChuanGPTQForCausalLM"]
auto_gptq/modeling/internlm.py (new file, 16 lines)

@@ -0,0 +1,16 @@
+from ._base import *
+
+
+class InternLMGPTQForCausalLM(BaseGPTQForCausalLM):
+    layer_type = "InternLMDecoderLayer"
+    layers_block_name = "model.layers"
+    outside_layer_modules = ["model.embed_tokens", "model.norm"]
+    inside_layer_modules = [
+        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
+        ["self_attn.o_proj"],
+        ["mlp.up_proj", "mlp.gate_proj"],
+        ["mlp.down_proj"],
+    ]
+
+
+__all__ = ["InternLMGPTQForCausalLM"]
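The two new model files follow the same small recipe, so supporting another architecture mostly means describing where its linear layers live. The class below is a hypothetical illustration (names invented), not part of the commit; a real addition would also register the model type in `_const.SUPPORTED_MODELS` and `auto.GPTQ_CAUSAL_LM_MODEL_MAP`, exactly as the baichuan and internlm hunks do.

```python
from auto_gptq.modeling._base import BaseGPTQForCausalLM


class MyDecoderGPTQForCausalLM(BaseGPTQForCausalLM):
    # Class name of the repeated transformer block in the HF implementation (invented here).
    layer_type = "MyDecoderLayer"
    # Attribute path to the ModuleList that holds those blocks.
    layers_block_name = "model.layers"
    # Modules outside the blocks (embeddings, final norm) needed during calibration.
    outside_layer_modules = ["model.embed_tokens", "model.norm"]
    # Linear sub-modules to quantize, grouped roughly in forward order.
    inside_layer_modules = [
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.o_proj"],
        ["mlp.up_proj", "mlp.gate_proj"],
        ["mlp.down_proj"],
    ]


__all__ = ["MyDecoderGPTQForCausalLM"]
```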
@@ -18,7 +18,16 @@ class FusedBaseModule(nn.Module, TritonModuleMixin):
 class FusedBaseAttentionModule(FusedBaseModule):
     @classmethod
     @abstractmethod
-    def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
+    def inject_to_model(
+        cls,
+        model,
+        use_triton=False,
+        group_size=-1,
+        use_cuda_fp16=True,
+        desc_act=False,
+        trainable=False,
+        **kwargs
+    ):
         raise NotImplementedError()
 
     @classmethod
@@ -226,7 +226,16 @@ class FusedGPTJAttentionForQuantizedModel(FusedBaseAttentionModule):
         return outputs  # a, present, (attentions)
 
     @classmethod
-    def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
+    def inject_to_model(
+        cls,
+        model,
+        use_triton=False,
+        group_size=-1,
+        use_cuda_fp16=True,
+        desc_act=False,
+        trainable=False,
+        **kwargs
+    ):
         config = model.config
         QuantLinear = dynamically_import_QuantLinear(use_triton=use_triton, desc_act=desc_act, group_size=group_size)
 
@@ -253,7 +262,7 @@ class FusedGPTJAttentionForQuantizedModel(FusedBaseAttentionModule):
             q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
             True if q_proj.bias is not None else False,
         )
-        qlinear_kwargs = dict()
+        qlinear_kwargs = {"trainable": trainable}
         if (not desc_act or group_size == -1) and not use_triton:
             qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
         qkv_proj = QuantLinear(*qlinear_args, **qlinear_kwargs)
@@ -126,7 +126,16 @@ class FusedLlamaAttentionForQuantizedModel(FusedBaseAttentionModule):
         return attn_output, attn_weights, past_key_value
 
     @classmethod
-    def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
+    def inject_to_model(
+        cls,
+        model,
+        use_triton=False,
+        group_size=-1,
+        use_cuda_fp16=True,
+        desc_act=False,
+        trainable=False,
+        **kwargs
+    ):
         """
         Replace all LlamaAttention modules with QuantLlamaAttention modules, fusing the q, k, v projections.
         """
@@ -153,7 +162,7 @@ class FusedLlamaAttentionForQuantizedModel(FusedBaseAttentionModule):
             q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
             True if q_proj.bias is not None else False,
         )
-        qlinear_kwargs = dict()
+        qlinear_kwargs = {"trainable": trainable}
         if (not desc_act or group_size == -1) and not use_triton:
             qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
         qkv_layer = QuantLinear(*qlinear_args, **qlinear_kwargs)
@@ -237,14 +237,6 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
         up_proj,
     ):
         super().__init__()
-        self.register_buffer('gate_proj_qweight', gate_proj.qweight)
-        self.register_buffer('gate_proj_scales', gate_proj.scales)
-        self.register_buffer('gate_proj_qzeros', gate_proj.qzeros)
-        self.register_buffer('gate_proj_g_idx', gate_proj.g_idx)
-        self.register_buffer('up_proj_qweight', up_proj.qweight)
-        self.register_buffer('up_proj_scales', up_proj.scales)
-        self.register_buffer('up_proj_qzeros', up_proj.qzeros)
-        self.register_buffer('up_proj_g_idx', up_proj.g_idx)
 
         self.infeatures = gate_proj.infeatures
         self.intermediate_size = gate_proj.outfeatures
@@ -252,6 +244,8 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
         self.bits = gate_proj.bits
         self.maxq = gate_proj.maxq
 
+        self.gate_proj = gate_proj
+        self.up_proj = up_proj
         self.down_proj = down_proj
 
     def forward(self, x):
@@ -266,40 +260,20 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
         c = torch.empty((M, N), device=x.device, dtype=torch.float16)
         grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
         quant_fused_matmul_248_kernel[grid](
-            x, c, self.gate_proj_qweight,
-            self.gate_proj_scales, self.gate_proj_qzeros, self.gate_proj_g_idx,
-            self.up_proj_qweight,
-            self.up_proj_scales, self.up_proj_qzeros, self.up_proj_g_idx,
+            x, c, self.gate_proj.qweight,
+            self.gate_proj.scales, self.gate_proj.qzeros, self.gate_proj.g_idx,
+            self.up_proj.qweight,
+            self.up_proj.scales, self.up_proj.qzeros, self.up_proj.g_idx,
             M, N, K,
             self.bits, self.maxq,
             x.stride(0), x.stride(1),
-            self.gate_proj_qweight.stride(0), self.gate_proj_qweight.stride(1),
+            self.gate_proj.qweight.stride(0), self.gate_proj.qweight.stride(1),
             c.stride(0), c.stride(1),
-            self.gate_proj_scales.stride(0), self.gate_proj_qzeros.stride(0)
+            self.gate_proj.scales.stride(0), self.gate_proj.qzeros.stride(0)
         )
         c = c.reshape(out_shape)
         return c
 
-    def fused2cuda(self):
-        self.gate_proj_qweight = self.gate_proj_qweight.cuda()
-        self.gate_proj_scales = self.gate_proj_scales.cuda()
-        self.gate_proj_qzeros = self.gate_proj_qzeros.cuda()
-        self.gate_proj_g_idx = self.gate_proj_g_idx.cuda()
-        self.up_proj_qweight = self.up_proj_qweight.cuda()
-        self.up_proj_scales = self.up_proj_scales.cuda()
-        self.up_proj_qzeros = self.up_proj_qzeros.cuda()
-        self.up_proj_g_idx = self.up_proj_g_idx.cuda()
-
-    def fused2cpu(self):
-        self.gate_proj_qweight = self.gate_proj_qweight.cpu()
-        self.gate_proj_scales = self.gate_proj_scales.cpu()
-        self.gate_proj_qzeros = self.gate_proj_qzeros.cpu()
-        self.gate_proj_g_idx = self.gate_proj_g_idx.cpu()
-        self.up_proj_qweight = self.up_proj_qweight.cpu()
-        self.up_proj_scales = self.up_proj_scales.cpu()
-        self.up_proj_qzeros = self.up_proj_qzeros.cpu()
-        self.up_proj_g_idx = self.up_proj_g_idx.cpu()
-
     @classmethod
     def inject_to_model(cls, model, use_triton=False, **kwargs):
         if not use_triton:
auto_gptq/nn_modules/qlinear/__init__.py (new file, 57 lines)

@@ -0,0 +1,57 @@
+import torch.nn as nn
+
+
+class GeneralQuantLinear(nn.Linear):
+    def __init__(self, quant_linear_module):
+        super().__init__(
+            in_features=quant_linear_module.infeatures,
+            out_features=quant_linear_module.outfeatures,
+            bias=True
+        )
+        self.infeatures = quant_linear_module.infeatures
+        self.outfeatures = quant_linear_module.outfeatures
+        self.bits = quant_linear_module.bits
+        self.group_size = quant_linear_module.group_size
+        self.maxq = quant_linear_module.maxq
+
+        self.weight.requires_grad = False
+
+        self.weight.data = quant_linear_module.qweight
+        self.qweight = self.weight
+        self.bias.data = quant_linear_module.bias
+
+        self.qweight.requires_grad = False
+        self.bias.requires_grad = False
+
+        self.qzeros = quant_linear_module.qzeros
+        self.scales = quant_linear_module.scales
+        self.g_idx = quant_linear_module.g_idx
+
+        if hasattr(quant_linear_module, "wf"):
+            self.wf = quant_linear_module.wf
+        if hasattr(quant_linear_module, "kernel_switch_threshold"):
+            self.kernel_switch_threshold = quant_linear_module.kernel_switch_threshold
+        if hasattr(quant_linear_module, "autogptq_cuda_available"):
+            self.autogptq_cuda_available = quant_linear_module.autogptq_cuda_available
+
+        self.trainable = quant_linear_module.trainable
+
+        self.forward = quant_linear_module.forward
+
+    @classmethod
+    def inject_to_model(cls, model, target_module_type):
+        for name, m in model.named_modules():
+            if not isinstance(m, target_module_type):
+                continue
+            new_m = cls(m)
+            if '.' in name:
+                parent_name = name.rsplit('.', 1)[0]
+                child_name = name[len(parent_name) + 1:]
+                parent = model.get_submodule(parent_name)
+            else:
+                parent_name = ''
+                parent = model
+                child_name = name
+
+            setattr(parent, child_name, new_m)
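For reference, this is how the new wrapper is driven from `BaseGPTQForCausalLM.make_sure_compatible_with_peft`; the helper below simply restates that call as a standalone sketch.

```python
import torch.nn as nn

from auto_gptq.nn_modules.qlinear import GeneralQuantLinear
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear


def make_peft_compatible(model: nn.Module, use_triton: bool, desc_act: bool, group_size: int) -> None:
    # Resolve the backend-specific QuantLinear class that make_quant used,
    # then swap every instance for a GeneralQuantLinear, so that libraries like
    # peft see a familiar nn.Linear subclass when scanning the module tree.
    QuantLinear = dynamically_import_QuantLinear(use_triton=use_triton, desc_act=desc_act, group_size=group_size)
    GeneralQuantLinear.inject_to_model(model, QuantLinear)
```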
@@ -9,11 +9,13 @@ import transformers
 logger = getLogger(__name__)
 
 try:
-    import autogptq_cuda
+    import autogptq_cuda_256
+    import autogptq_cuda_64
     _autogptq_cuda_available = True
 except ImportError:
     logger.warning('CUDA extension not installed.')
+    autogptq_cuda_256 = None
+    autogptq_cuda_64 = None
     _autogptq_cuda_available = False
 
 
@@ -26,10 +28,14 @@ class QuantLinear(nn.Module):
         outfeatures,
         bias,
         kernel_switch_threshold=128,
+        trainable=False
     ):
         super().__init__()
+        global _autogptq_cuda_available
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+        if trainable:
+            _autogptq_cuda_available = False
 
         self.infeatures = infeatures
         self.outfeatures = outfeatures
@@ -73,9 +79,15 @@ class QuantLinear(nn.Module):
 
         self.kernel_switch_threshold = kernel_switch_threshold
         self.autogptq_cuda_available = _autogptq_cuda_available
+
+        self.autogptq_cuda = autogptq_cuda_256
         if infeatures % 256 != 0 or outfeatures % 256 != 0:
+            self.autogptq_cuda = autogptq_cuda_64
+        if infeatures % 64 != 0 or outfeatures % 64 != 0:
             self.autogptq_cuda_available = False
 
+        self.trainable = trainable
+
     def pack(self, linear, scales, zeros, g_idx=None):
         W = linear.weight.data.clone()
         if isinstance(linear, nn.Conv2d):
@@ -184,13 +196,13 @@ class QuantLinear(nn.Module):
         ):
             out = torch.zeros((x.shape[0], self.outfeatures), device=x.device, dtype=torch.float32)
             if self.bits == 2:
-                autogptq_cuda.vecquant2matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
+                self.autogptq_cuda.vecquant2matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
             elif self.bits == 3:
-                autogptq_cuda.vecquant3matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
+                self.autogptq_cuda.vecquant3matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
             elif self.bits == 4:
-                autogptq_cuda.vecquant4matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
+                self.autogptq_cuda.vecquant4matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
             elif self.bits == 8:
-                autogptq_cuda.vecquant8matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
+                self.autogptq_cuda.vecquant8matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
             else:
                 raise NotImplementedError("Only 2,3,4,8 bits are supported.")
         else:
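The net effect of the constructor hunks above: the extension built for 256-aligned shapes is preferred, the 64-aligned build is the fallback, and the CUDA path is disabled entirely (falling back to the pure-PyTorch dequantization path) when even 64 does not divide the feature dimensions or when the layer is created with trainable=True. A small sketch of how one might confirm which kernel a layer ends up with; the layer dimensions here are just an assumption:

from auto_gptq.nn_modules.qlinear.qlinear_cuda import QuantLinear

# 4096 and 11008 are both divisible by 256, so this layer should bind autogptq_cuda_256
layer = QuantLinear(bits=4, group_size=128, infeatures=4096, outfeatures=11008, bias=False)
ext_name = getattr(layer.autogptq_cuda, "__name__", None)  # None if the CUDA extension isn't installed
print(ext_name, layer.autogptq_cuda_available)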
@@ -7,15 +7,17 @@ import torch.nn as nn
 import transformers
 
 logger = getLogger(__name__)
 
 try:
-    import autogptq_cuda
+    import autogptq_cuda_256
+    import autogptq_cuda_64
     _autogptq_cuda_available = True
 except ImportError:
     logger.warning('CUDA extension not installed.')
+    autogptq_cuda_256 = None
+    autogptq_cuda_64 = None
     _autogptq_cuda_available = False
 
 
 class QuantLinear(nn.Module):
     def __init__(
         self,
@@ -25,12 +27,15 @@ class QuantLinear(nn.Module):
         outfeatures,
         bias,
         use_cuda_fp16=True,
-        kernel_switch_threshold=128
+        kernel_switch_threshold=128,
+        trainable=False
     ):
-
         super().__init__()
+        global _autogptq_cuda_available
         if bits not in [2, 3, 4, 8]:
             raise NotImplementedError("Only 2,3,4,8 bits are supported.")
+        if trainable:
+            _autogptq_cuda_available = False
         self.infeatures = infeatures
         self.outfeatures = outfeatures
         self.bits = bits
@@ -77,10 +82,21 @@ class QuantLinear(nn.Module):
 
         self.kernel_switch_threshold = kernel_switch_threshold
         self.autogptq_cuda_available = _autogptq_cuda_available
+        self.autogptq_cuda = autogptq_cuda_256
         if infeatures % 256 != 0 or outfeatures % 256 != 0:
+            self.autogptq_cuda = autogptq_cuda_64
+        if infeatures % 64 != 0 or outfeatures % 64 != 0:
             self.autogptq_cuda_available = False
 
+        self.trainable = trainable
+
     def pack(self, linear, scales, zeros, g_idx):
+        W = linear.weight.data.clone()
+        if isinstance(linear, nn.Conv2d):
+            W = W.flatten(1)
+        if isinstance(linear, transformers.pytorch_utils.Conv1D):
+            W = W.t()
+
         scales = scales.t().contiguous()
         zeros = zeros.t().contiguous()
         scale_zeros = zeros * scales
@@ -93,7 +109,7 @@ class QuantLinear(nn.Module):
             g_idx = idx // self.group_size
             intweight.append(
                 torch.round(
-                    (linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]
+                    (W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]
                 ).to(torch.int)[:, None]
             )
         intweight = torch.cat(intweight, dim=1)
@@ -182,24 +198,24 @@ class QuantLinear(nn.Module):
             if self.use_cuda_fp16:
                 x = x.half()
                 if self.bits == 2:
-                    autogptq_cuda.vecquant2matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
+                    self.autogptq_cuda.vecquant2matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
                 elif self.bits == 3:
-                    autogptq_cuda.vecquant3matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
+                    self.autogptq_cuda.vecquant3matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
                 elif self.bits == 4:
-                    autogptq_cuda.vecquant4matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
+                    self.autogptq_cuda.vecquant4matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
 
                 else:
                     raise NotImplementedError("Only 2,3,4 bits are supported.")
             else:
                 x = x.float()
                 if self.bits == 2:
-                    autogptq_cuda.vecquant2matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
+                    self.autogptq_cuda.vecquant2matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
                 elif self.bits == 3:
-                    autogptq_cuda.vecquant3matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
+                    self.autogptq_cuda.vecquant3matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
                 elif self.bits == 4:
-                    autogptq_cuda.vecquant4matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
+                    self.autogptq_cuda.vecquant4matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
                 elif self.bits == 8:
-                    autogptq_cuda.vecquant8matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
+                    self.autogptq_cuda.vecquant8matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
                 else:
                     raise NotImplementedError("Only 2,3,4,8 bits are supported.")
         else:
@@ -1,17 +1,20 @@
 import math
+from logging import getLogger
+
 import numpy as np
 import torch
 import torch.nn as nn
 import transformers
-from torch.cuda.amp import custom_bwd, custom_fwd
-from logging import getLogger
 
-from .triton_utils.mixin import TritonModuleMixin
+from ..triton_utils.mixin import TritonModuleMixin
 
 logger = getLogger(__name__)
 
 try:
-    from .triton_utils.kernels import quant_matmul_248, transpose_quant_matmul_248, QuantLinearFunction
+    from ..triton_utils.kernels import (
+        quant_matmul_248, transpose_quant_matmul_248, quant_matmul_inference_only_248,
+        QuantLinearFunction, QuantLinearInferenceOnlyFunction
+    )
 except ImportError:
     logger.error('triton not installed.')
     raise
@@ -24,13 +27,14 @@ class QuantLinear(nn.Module, TritonModuleMixin):
         group_size,
         infeatures,
         outfeatures,
-        bias
+        bias,
+        trainable=False
     ):
         super().__init__()
         if bits not in [2, 4, 8]:
             raise NotImplementedError("Only 2,4,8 bits are supported.")
-        if infeatures % 256 != 0 or outfeatures % 256 != 0:
-            raise NotImplementedError("in_feature or out_feature must be divisible by 256.")
+        if infeatures % 32 != 0 or outfeatures % 32 != 0:
+            raise NotImplementedError("in_feature and out_feature must be divisible by 32.")
         self.infeatures = infeatures
         self.outfeatures = outfeatures
         self.bits = bits
@@ -58,6 +62,8 @@ class QuantLinear(nn.Module, TritonModuleMixin):
         else:
             self.bias = None
 
+        self.trainable = trainable
+
     def pack(self, linear, scales, zeros, g_idx=None):
         W = linear.weight.data.clone()
         if isinstance(linear, nn.Conv2d):
@@ -122,7 +128,8 @@ class QuantLinear(nn.Module, TritonModuleMixin):
 
     def forward(self, x):
         out_shape = x.shape[:-1] + (self.outfeatures,)
-        out = QuantLinearFunction.apply(
+        quant_linear_fn = QuantLinearFunction if self.trainable else QuantLinearInferenceOnlyFunction
+        out = quant_linear_fn.apply(
             x.reshape(-1, x.shape[-1]),
             self.qweight,
             self.scales,
@@ -160,11 +167,14 @@ class QuantLinear(nn.Module, TritonModuleMixin):
         for m in tqdm(range(0, math.ceil(math.log2(seqlen)) + 1)):
             m = 2 ** m
             for (k, n), (qweight, scales, qzeros, g_idx, bits, maxq) in kn_values.items():
-                a = torch.randn(m, k, dtype=torch.float16, device=model.device)
-                quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
                 if transpose:
+                    a = torch.randn(m, k, dtype=torch.float16, device=model.device)
+                    quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
                     a = torch.randn(m, n, dtype=torch.float16, device=model.device)
                     transpose_quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
+                else:
+                    a = torch.randn(m, k, dtype=torch.float16, device=model.device)
+                    quant_matmul_inference_only_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
         del kn_values
 
@@ -73,27 +73,7 @@ logger = getLogger(__name__)
             },
             num_stages=2,
            num_warps=8
-        ),
-        triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 64,
-                'GROUP_SIZE_M': 8
-            },
-            num_stages=3,
-            num_warps=8
-        ),
-        triton.Config(
-            {
-                'BLOCK_SIZE_M': 32,
-                'BLOCK_SIZE_N': 32,
-                'BLOCK_SIZE_K': 128,
-                'GROUP_SIZE_M': 8
-            },
-            num_stages=2,
-            num_warps=4
-        ),
+        )
     ],
     key=['M', 'N', 'K'],
     nearest_power_of_two=True,
@@ -244,27 +224,7 @@ def quant_matmul_248_kernel(
             },
             num_stages=2,
             num_warps=8
-        ),
-        triton.Config(
-            {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 64,
-                'GROUP_SIZE_M': 8
-            },
-            num_stages=3,
-            num_warps=8
-        ),
-        triton.Config(
-            {
-                'BLOCK_SIZE_M': 32,
-                'BLOCK_SIZE_N': 128,
-                'BLOCK_SIZE_K': 32,
-                'GROUP_SIZE_M': 8
-            },
-            num_stages=2,
-            num_warps=4
-        ),
+        )
     ],
     key=['M', 'N', 'K'],
     nearest_power_of_two=True
@@ -356,7 +316,6 @@ def silu(x):
     return x * tl.sigmoid(x)
 
 
-
 def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq):
     with torch.cuda.device(input.device):
         output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=input.dtype)
@@ -414,3 +373,30 @@ class QuantLinearFunction(torch.autograd.Function):
         if ctx.needs_input_grad[0]:
             grad_input = transpose_quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
         return grad_input, None, None, None, None, None, None
+
+
+def quant_matmul_inference_only_248(input, qweight, scales, qzeros, g_idx, bits, maxq):
+    with torch.cuda.device(input.device):
+        output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16)
+        grid = lambda META: (
+            triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(qweight.shape[1], META['BLOCK_SIZE_N']),
+        )
+        quant_matmul_248_kernel[grid](
+            input, qweight, output,
+            scales, qzeros, g_idx,
+            input.shape[0], qweight.shape[1], input.shape[1],
+            bits, maxq,
+            input.stride(0), input.stride(1),
+            qweight.stride(0), qweight.stride(1),
+            output.stride(0), output.stride(1),
+            scales.stride(0), qzeros.stride(0)
+        )
+        return output
+
+
+class QuantLinearInferenceOnlyFunction(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd(cast_inputs=torch.float16)
+    def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
+        output = quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq)
+        return output
@@ -0,0 +1 @@
+from .perplexity_utils import Perplexity
@@ -7,15 +7,22 @@ try:
 except ImportError:
     TRITON_AVAILABLE = False
 
+try:
+    import autogptq_cuda
+
+    AUTOGPTQ_CUDA_AVAILABLE = True
+except:
+    AUTOGPTQ_CUDA_AVAILABLE = False
+
+
 def dynamically_import_QuantLinear(use_triton: bool, desc_act: bool, group_size: int):
     if use_triton:
-        from ..nn_modules.qlinear_triton import QuantLinear
+        from ..nn_modules.qlinear.qlinear_triton import QuantLinear
     else:
         if not desc_act or group_size == -1:
-            from ..nn_modules.qlinear_old import QuantLinear
+            from ..nn_modules.qlinear.qlinear_cuda_old import QuantLinear
         else:
-            from ..nn_modules.qlinear import QuantLinear
+            from ..nn_modules.qlinear.qlinear_cuda import QuantLinear
 
     return QuantLinear
 
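Assuming this hunk belongs to auto_gptq/utils/import_utils.py, a quick sketch of how the resolver behaves after the package reorganisation; the argument values are only examples:

from auto_gptq.utils.import_utils import dynamically_import_QuantLinear

# act-order (desc_act=True) with a real group size -> the exact CUDA kernels
QuantLinear = dynamically_import_QuantLinear(use_triton=False, desc_act=True, group_size=128)
print(QuantLinear.__module__)  # expected: auto_gptq.nn_modules.qlinear.qlinear_cuda

# no act-order (or group_size == -1) -> the "old" fused-friendly CUDA kernels
QuantLinearOld = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=128)
print(QuantLinearOld.__module__)  # expected: auto_gptq.nn_modules.qlinear.qlinear_cuda_old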
auto_gptq/utils/peft_utils.py (new file, 423 lines)
@@ -0,0 +1,423 @@
import warnings
import re
from contextlib import contextmanager
from dataclasses import asdict
from enum import Enum
from typing import List, Optional

import torch
from peft import get_peft_model, PeftConfig, PeftModel, PeftType
from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING
from peft.tuners.lora import LoraConfig, LoraLayer, LoraModel, Embedding
from peft.tuners.adalora import AdaLoraConfig, AdaLoraLayer, AdaLoraModel
from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING
from peft.utils.other import _get_submodules

from ..modeling._base import BaseGPTQForCausalLM


class GPTQLoraConfig(LoraConfig):
    injected_fused_attention: bool = False
    injected_fused_mlp: bool = False


class GPTQLoraLinear(torch.nn.Linear, LoraLayer):
    def __init__(
        self,
        adapter_name: str,
        linear_module: torch.nn.Linear,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        **kwargs,
    ):
        init_lora_weights = kwargs.pop("init_lora_weights", True)

        torch.nn.Linear.__init__(self, linear_module.in_features, linear_module.out_features)
        LoraLayer.__init__(self, linear_module.in_features, linear_module.out_features)

        self.linear_module = linear_module

        self.weight.requires_grad = False
        self.weight = self.linear_module.weight
        self.bias = self.linear_module.bias
        self.fan_in_fan_out = fan_in_fan_out
        if fan_in_fan_out:
            self.weight.data = self.weight.data.T

        self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
        self.active_adapter = adapter_name

    def reset_lora_parameters(self, adapter_name):
        if adapter_name in self.lora_A.keys():
            torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
            torch.nn.init.zeros_(self.lora_B[adapter_name].weight)

    def merge(self):
        raise NotImplementedError("gptq model not support merge lora adapter")

    def unmerge(self):
        raise NotImplementedError("gptq model not support unmerge lora adapter")

    def forward(self, x: torch.Tensor):
        previous_dtype = x.dtype
        if self.active_adapter not in self.lora_A.keys():
            return self.linear_module(x)
        if self.disable_adapters:
            if self.r[self.active_adapter] > 0 and self.merged:
                self.unmerge()
            result = self.linear_module(x)
        elif self.r[self.active_adapter] > 0 and not self.merged:
            result = self.linear_module(x)

            lora_B = self.lora_B[self.active_adapter]
            lora_A = self.lora_A[self.active_adapter]
            lora_dropout = self.lora_dropout[self.active_adapter]
            scale = self.scaling[self.active_adapter]

            x = x.type_as(lora_A.weight.data)
            adapter_result = (lora_B(lora_A(lora_dropout(x))) * scale).type_as(result)
            result += adapter_result
        else:
            result = self.linear_module(x)

        result = result.to(previous_dtype)

        return result


class GPTQLoraModel(LoraModel):
    def _find_and_replace(self, adapter_name):
        lora_config = self.peft_config[adapter_name]
        is_target_modules_in_base_model = False
        kwargs = {
            "r": lora_config.r,
            "lora_alpha": lora_config.lora_alpha,
            "lora_dropout": lora_config.lora_dropout,
            "fan_in_fan_out": lora_config.fan_in_fan_out,
            "init_lora_weights": lora_config.init_lora_weights,
        }
        key_list = [key for key, _ in self.model.named_modules()]
        for key in key_list:
            if isinstance(lora_config.target_modules, str):
                target_module_found = re.fullmatch(lora_config.target_modules, key)
            else:
                target_module_found = any(key.endswith(target_key) for target_key in lora_config.target_modules)
            if target_module_found:
                if not is_target_modules_in_base_model:
                    is_target_modules_in_base_model = True
                parent, target, target_name = _get_submodules(self.model, key)
                bias = False
                if hasattr(target, "bias"):
                    bias = target.bias is not None

                if isinstance(target, LoraLayer):
                    target.update_layer(
                        adapter_name,
                        lora_config.r,
                        lora_config.lora_alpha,
                        lora_config.lora_dropout,
                        lora_config.init_lora_weights,
                    )
                else:
                    if isinstance(target, torch.nn.Embedding):
                        embedding_kwargs = kwargs.copy()
                        embedding_kwargs.pop("fan_in_fan_out", None)
                        in_features, out_features = target.num_embeddings, target.embedding_dim
                        new_module = Embedding(adapter_name, in_features, out_features, **embedding_kwargs)
                    else:
                        if isinstance(target, torch.nn.Linear):
                            if kwargs["fan_in_fan_out"]:
                                warnings.warn(
                                    "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
                                    "Setting fan_in_fan_out to False."
                                )
                                kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
                        else:
                            raise ValueError(
                                f"Target module {target} is not supported. "
                                f"Currently, only `torch.nn.Linear` and its subclasses are supported."
                            )
                        new_module = GPTQLoraLinear(adapter_name, target, **kwargs)

                self._replace_module(parent, target_name, new_module, target)
        if not is_target_modules_in_base_model:
            raise ValueError(
                f"Target modules {lora_config.target_modules} not found in the base model. "
                f"Please check the target modules and try again."
            )

    def _replace_module(self, parent_module, child_name, new_module, old_module):
        setattr(parent_module, child_name, new_module)
        if not isinstance(new_module, GPTQLoraLinear):
            new_module.weight = old_module.weight
            if hasattr(old_module, "bias"):
                if old_module.bias is not None:
                    new_module.bias = old_module.bias

            if getattr(old_module, "state", None) is not None:
                new_module.state = old_module.state
                new_module.to(old_module.weight.device)

            # dispatch to correct device
            for name, module in new_module.named_modules():
                if "lora_" in name:
                    module.to(old_module.weight.device)

    def merge_adapter(self):
        raise NotImplementedError("gptq model not support merge ada lora adapter")

    def unmerge_adapter(self):
        raise NotImplementedError("gptq model not support unmerge ada lora adapter")

    def merge_and_unload(self):
        raise NotImplementedError("gptq model not support merge and unload")


class GPTQAdaLoraConfig(AdaLoraConfig):
    injected_fused_attention: bool = False
    injected_fused_mlp: bool = False


class GPTQSVDLinear(torch.nn.Linear, AdaLoraLayer):
    def __init__(
        self,
        adapter_name: str,
        linear_module: torch.nn.Linear,
        r: int = 0,
        lora_alpha: int = 1,
        lora_dropout: float = 0.0,
        fan_in_fan_out: bool = False,  # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
        **kwargs,
    ):
        init_lora_weights = kwargs.pop("init_lora_weights", True)

        torch.nn.Linear.__init__(self, linear_module.in_features, linear_module.out_features)
        AdaLoraLayer.__init__(self, linear_module.in_features, linear_module.out_features)

        self.linear_module = linear_module

        self.weight.requires_grad = False
        self.weight = self.linear_module.weight
        self.bias = self.linear_module.bias
        self.fan_in_fan_out = fan_in_fan_out
        if fan_in_fan_out:
            self.weight.data = self.weight.data.T

        self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
        self.active_adapter = adapter_name

    def merge(self):
        raise NotImplementedError("gptq model not support merge lora adapter")

    def unmerge(self):
        raise NotImplementedError("gptq model not support unmerge lora adapter")

    def forward(self, x: torch.Tensor):
        if self.active_adapter not in self.lora_A.keys():
            return self.linear_module(x)
        if self.disable_adapters:
            if self.r[self.active_adapter] > 0 and self.merged:
                self.unmerge()
            result = self.linear_module(x)
        elif self.r[self.active_adapter] > 0 and not self.merged:
            result = self.linear_module(x)
            result += (
                (
                    self.lora_dropout[self.active_adapter](x)
                    @ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter]).T
                    @ self.lora_B[self.active_adapter].T
                )
                * self.scaling[self.active_adapter]
                / (self.ranknum[self.active_adapter] + 1e-5)
            )
        else:
            result = self.linear_module(x)
        return result


class GPTQAdaLoraModel(AdaLoraModel):
    def _find_and_replace(self, adapter_name):
        lora_config = self.peft_config[adapter_name]
        is_target_modules_in_base_model = False
        kwargs = {
            "r": lora_config.init_r,
            "lora_alpha": lora_config.lora_alpha,
            "lora_dropout": lora_config.lora_dropout,
            "fan_in_fan_out": lora_config.fan_in_fan_out,
            "init_lora_weights": lora_config.init_lora_weights,
        }
        key_list = [key for key, _ in self.model.named_modules()]
        for key in key_list:
            if isinstance(lora_config.target_modules, str):
                target_module_found = re.fullmatch(lora_config.target_modules, key)
            else:
                target_module_found = any(key.endswith(target_key) for target_key in lora_config.target_modules)
            if target_module_found:
                if not is_target_modules_in_base_model:
                    is_target_modules_in_base_model = True
                parent, target, target_name = _get_submodules(self.model, key)
                bias = target.bias is not None
                if isinstance(target, LoraLayer):
                    target.update_layer(
                        adapter_name,
                        lora_config.init_r,
                        lora_config.lora_alpha,
                        lora_config.lora_dropout,
                        lora_config.init_lora_weights,
                    )
                else:
                    if isinstance(target, torch.nn.Linear):
                        in_features, out_features = target.in_features, target.out_features
                        if kwargs["fan_in_fan_out"]:
                            warnings.warn(
                                "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
                                "Setting fan_in_fan_out to False."
                            )
                            kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
                    else:
                        raise ValueError(
                            f"Target module {target} is not supported. "
                            f"Currently, only `torch.nn.Linear` and its subclasses are supported."
                        )
                    new_module = GPTQSVDLinear(adapter_name, target, **kwargs)

                self._replace_module(parent, target_name, new_module, target)
        if not is_target_modules_in_base_model:
            raise ValueError(
                f"Target modules {lora_config.target_modules} not found in the base model. "
                f"Please check the target modules and try again."
            )

    def _replace_module(self, parent_module, child_name, new_module, old_module):
        setattr(parent_module, child_name, new_module)

        # dispatch to correct device
        for name, module in new_module.named_modules():
            if "lora_" in name:
                module.to(old_module.weight.device)

    def merge_adapter(self):
        raise NotImplementedError("gptq model not support merge ada lora adapter")

    def unmerge_adapter(self):
        raise NotImplementedError("gptq model not support unmerge ada lora adapter")

    def merge_and_unload(self):
        raise NotImplementedError("gptq model not support merge and unload")


def find_all_linear_names(model: BaseGPTQForCausalLM, ignore: Optional[List[str]] = None, ignore_lm_head: bool = True):
    if not ignore:
        ignore = []
    lm_head_name = model.lm_head_name
    if ignore_lm_head and lm_head_name not in ignore:
        ignore.append(lm_head_name)
    results = set()
    for n, m in model.named_modules():
        if isinstance(m, torch.nn.Linear):
            res = n.split('.')[-1]
            if res not in ignore:
                results.add(res)
    return list(results)


@contextmanager
def hijack_peft_mappings():
    PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
    PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
    PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
    PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel

    try:
        yield
    except:
        PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
        PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
        PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
        PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel
        raise
    finally:
        PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
        PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
        PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
        PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel


def get_gptq_peft_model(
    model: BaseGPTQForCausalLM,
    peft_config: PeftConfig = None,
    model_id: str = None,
    adapter_name: str = "default",
    auto_find_all_linears: bool = True,
    train_mode: bool = False
):
    if train_mode and not model.trainable:
        model.enable_trainable_mode()
    if train_mode and not peft_config:
        raise ValueError("peft_config not specified when in train mode.")
    if not train_mode and not model_id:
        raise ValueError("model_id(where to load adapters) not specified when in inference mode.")

    if model.fused_attn_module_type is not None and not model.injected_fused_attention:
        peft_types = [PeftType.LORA.value, PeftType.ADALORA.value]
        warnings.warn(
            f"You can just ignore this warning if the peft type you use isn't in {peft_types}.\n"
            f"{model.__class__.__name__} supports injecting fused attention but not enables this time. "
            "If you are training adapters, you must also disable fused attention injection when loading quantized "
            "base model at inference time, otherwise adapters may not be added to base model properly. "
            "If you are loading adapters to do inference, you can reference to adapter's config file to check "
            "whether the adapters are trained using base model that not enable fused attention injection."
        )
    if model.injected_fused_mlp:
        raise NotImplementedError("GPTQ model that enables fused mlp injection is not supported to integrate with peft.")

    if train_mode:
        peft_type = peft_config.peft_type
        if not isinstance(peft_type, str):
            peft_type = peft_type.value
        if peft_type in [PeftType.LORA.value, PeftType.ADALORA.value]:
            if auto_find_all_linears:
                peft_config.target_modules = find_all_linear_names(model, ignore_lm_head=True)
            if peft_type == PeftType.LORA.value and not isinstance(peft_config, GPTQLoraConfig):
                peft_config = GPTQLoraConfig(**peft_config.to_dict())
            if peft_type == PeftType.ADALORA.value and not isinstance(peft_config, GPTQAdaLoraConfig):
                peft_config = GPTQAdaLoraConfig(**peft_config.to_dict())
            peft_config.injected_fused_attention = model.injected_fused_attention
            peft_config.injected_fused_mlp = model.injected_fused_mlp
        if peft_type == PeftType.ADAPTION_PROMPT.value:
            if peft_config.adapter_layers > model.config.num_hidden_layers:
                warnings.warn(
                    f"model has only {model.config.num_hidden_layers} layers "
                    f"but adapter_layers is set to {peft_config.adapter_layers}, "
                    f"will reset value to {model.config.num_hidden_layers}."
                )
                peft_config.adapter_layers = model.config.num_hidden_layers
            if model.injected_fused_attention:
                raise NotImplementedError(
                    "model with fused attention injected isn't supported to use ADAPTION_PROMPT peft type yet."
                )

    with hijack_peft_mappings():
        try:
            if train_mode:
                peft_model = get_peft_model(model.model, peft_config)
            else:
                peft_model = PeftModel.from_pretrained(model.model, model_id, adapter_name)
        except:
            raise NotImplementedError(
                f"{model.__class__.__name__} not support {peft_config.peft_type.value} peft type yet."
            )

    return peft_model


__all__ = [
    "GPTQLoraConfig",
    "GPTQLoraModel",
    "GPTQAdaLoraConfig",
    "GPTQAdaLoraModel",
    "find_all_linear_names",
    "get_gptq_peft_model"
]
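A minimal training-side sketch of how this module might be used. The checkpoint path and hyperparameters are placeholders, and the from_quantized keyword names (inject_fused_attention, inject_fused_mlp, trainable) follow the peft examples in this branch, so treat them as assumptions rather than a fixed API.

from auto_gptq import AutoGPTQForCausalLM
from auto_gptq.utils.peft_utils import GPTQLoraConfig, get_gptq_peft_model
from peft import TaskType

# load the quantized base model in trainable mode; fused attention/MLP injection is
# kept off, as the warning emitted by get_gptq_peft_model recommends for adapter training
model = AutoGPTQForCausalLM.from_quantized(
    "path/to/quantized-model",
    device="cuda:0",
    use_triton=True,
    inject_fused_attention=False,
    inject_fused_mlp=False,
    trainable=True,
)

peft_config = GPTQLoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM, inference_mode=False,
)

# auto_find_all_linears=True lets find_all_linear_names pick the target modules
peft_model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
peft_model.print_trainable_parameters()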
auto_gptq/utils/perplexity_utils.py (new file, 215 lines)
@@ -0,0 +1,215 @@
import sys
import torch
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


class Perplexity:
    """
    A class for calculating the perplexity of a language model.
    """

    def __init__(self, model, tokenizer, dataset_path='wikitext', dataset_name=None, split='test', text_column='text'):
        """
        Calculate perplexity using the same method as seen in llama.cpp.

        Parameters
        ----------
        model : AutoModelForCausalLM
            The language model for which the perplexity is calculated.
        tokenizer : AutoTokenizer
            The tokenizer corresponding to the model.
        device : str, optional
            The device to run the calculations on. If auto, the device that your model uses
            will be the device used for these calculations. Default is 'auto'.
        dataset_path : str, optional
            The path to the dataset on the Hugging Face dataset hub. Default is 'wikitext'.
        dataset_name : str, optional
            The name of the dataset. Default is None.
        split : str, optional
            The split of the dataset to use. Default is 'test'.
        text_column : str, optional
            The name of the column in the dataset that contains the text data. Default is 'text'.
        """
        self._model = model
        self._tokenizer = tokenizer
        self._dataset_path = dataset_path
        self._dataset_name = dataset_name
        self._split = split
        self._text_column = text_column
        self._text = self._prepare_data()

    def _get_device(self):
        if torch.backends.mps.is_available():
            return 'mps'
        elif torch.cuda.is_available():
            return 'cuda:0'
        else:
            return 'cpu'

    def _prepare_data(self):
        """
        Prepares the dataset by loading and formatting.

        Returns
        -------
        str
            The formatted dataset as a single string.
        """
        if self._dataset_path == 'wikitext':
            self._dataset_name = 'wikitext-2-raw-v1'

        # Load the dataset
        data = load_dataset(self._dataset_path, self._dataset_name, split=self._split)
        # Format the text column of the dataset
        text_list = [' \n' if s == '' else s for s in data[self._text_column]]
        return ''.join(text_list)

    @staticmethod
    def softmax(logits):
        """
        Static method for applying the softmax function.

        Parameters
        ----------
        logits : np.ndarray
            The input to the softmax function.

        Returns
        -------
        np.ndarray
            The output of the softmax function.
        """
        e_x = np.exp(logits - np.max(logits))
        return e_x / e_x.sum(axis=0)

    def calculate_perplexity(self, n_ctx=512, n_batch=512):
        """
        Calculates the perplexity of the language model.

        Parameters
        ----------
        n_ctx : int
            The context size.
        n_batch : int
            The batch size.

        Returns
        -------
        list
            The list of perplexity scores calculated.
        """
        # Tokenize the text
        self._tokenizer.model_max_length = sys.maxsize
        tokens = self._tokenizer(self._text, truncation=False, return_tensors='pt').input_ids.to(self._model.device)

        nll = 0.0  # Negative log likelihood
        count = 0  # Counter for processed tokens
        curr_ppl = 0
        all_perplexity = []

        with tqdm(range(len(tokens[0]) // n_ctx), desc="Perplexity: - ") as progress:
            for i in progress:
                # Process each batch of tokens
                nll, count = self._process_batch(i, n_ctx, n_batch, tokens, nll, count)

                # Calculate and display the current perplexity
                curr_ppl = np.exp(nll / count)
                all_perplexity.append(curr_ppl)
                progress.set_description(f"Perplexity: {curr_ppl:.4f}")

        return all_perplexity

    def _process_batch(self, i, n_ctx, n_batch, tokens, nll, count):
        """
        Processes each batch of tokens.

        Parameters
        ----------
        i : int
            The batch index.
        n_ctx : int
            The context size.
        n_batch : int
            The batch size.
        tokens : torch.Tensor
            The tokenized text.
        nll : float
            The current negative log likelihood.
        count : int
            The current count of processed tokens.

        Returns
        -------
        float
            The updated negative log likelihood.
        int
            The updated count of processed tokens.
        """
        start = i * n_ctx
        end = start + n_ctx

        num_batches = (n_ctx + n_batch - 1) // n_batch

        logits = []

        for j in range(num_batches):
            batch_start = start + j * n_batch
            batch_size = min(end - batch_start, n_batch)

            token_org = tokens[0][batch_start].item()

            if j == 0:
                # Replace the first token with the BOS token
                tokens[0][batch_start] = self._tokenizer.bos_token_id

            # Compute the logits for the current batch of tokens
            batch_logits = self._compute_batch_logits(tokens, batch_start, batch_size)

            tokens[0][batch_start] = token_org

            logits.append(batch_logits)

        # We rely on the fact that attention in the forward pass only looks at previous
        # tokens here, so the logits returned for each token are an accurate representation
        # of what the model would have predicted at that point.
        #
        # Example, we have a context window of 512, we will compute perplexity for each of the
        # last 256 tokens.  Then, we split the input up into context window size chunks to
        # process the entire prompt.

        for j in range(min(512, n_ctx // 2), n_ctx - 1):
            tok_logits = logits[0][0][j].cpu().numpy()
            # Compute the probability of the next token
            prob = self.softmax(tok_logits)[tokens[0][start + j + 1]]

            # Update the negative log likelihood and the count of processed tokens
            nll += -np.log(prob, where=prob>0)
            count += 1

        return nll, count

    def _compute_batch_logits(self, tokens, batch_start, batch_size):
        """
        Computes the logits for a batch of tokens.

        Parameters
        ----------
        tokens : torch.Tensor
            The tokenized text.
        batch_start : int
            The start index of the batch.
        batch_size : int
            The size of the batch.

        Returns
        -------
        torch.Tensor
            The logits for the batch of tokens.
        """
        # Compute the logits without keeping track of gradients
        with torch.no_grad():
            outputs = self._model(tokens[:, batch_start:batch_start+batch_size])
        return outputs.logits.detach()
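For reference, a minimal sketch of scoring a model with this class; the model path is a placeholder, and the import of Perplexity from auto_gptq.utils assumes the one-line __init__ hunk above lives in that package:

from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq.utils import Perplexity

tokenizer = AutoTokenizer.from_pretrained("path/to/model", use_fast=True)
model = AutoModelForCausalLM.from_pretrained("path/to/model", device_map="auto")

# defaults: wikitext-2-raw-v1 test split, 'text' column, llama.cpp-style windowing
ppl = Perplexity(model, tokenizer)
scores = ppl.calculate_perplexity(n_ctx=512, n_batch=512)
print(f"final perplexity: {scores[-1]:.4f}")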
@@ -172,16 +172,16 @@ void vecquant4matmul_faster_old(
 
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
-  m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
-  m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
-  m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
+  m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
 
   m.def("vecquant2matmul_old", &vecquant2matmul_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
   m.def("vecquant3matmul_old", &vecquant3matmul_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
   m.def("vecquant4matmul_old", &vecquant4matmul_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
   m.def("vecquant8matmul_old", &vecquant8matmul_old, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
-  m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
+  m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA), faster version");
   m.def("vecquant3matmul_faster_old", &vecquant3matmul_faster_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA), faster version");
   m.def("vecquant4matmul_faster_old", &vecquant4matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
 }
autogptq_cuda/autogptq_cuda_64.cpp (new file, 187 lines)
@@ -0,0 +1,187 @@
#include <torch/all.h>
#include <torch/python.h>
#include <c10/cuda/CUDAGuard.h>

void vecquant2matmul_cuda(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
);

void vecquant2matmul(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant2matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}

void vecquant3matmul_cuda(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
);

void vecquant3matmul(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant3matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}

void vecquant4matmul_cuda(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
);

void vecquant4matmul(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant4matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}

void vecquant8matmul_cuda(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
);

void vecquant8matmul(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  torch::Tensor g_idx
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}


// old

void vecquant2matmul_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
);

void vecquant2matmul_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant2matmul_cuda_old(vec, mat, mul, scales, zeros,groupsize);
}

void vecquant3matmul_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
);

void vecquant3matmul_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant3matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}

void vecquant4matmul_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
);

void vecquant4matmul_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant4matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}

void vecquant8matmul_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
);

void vecquant8matmul_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant8matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}

void vecquant2matmul_faster_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
);

void vecquant2matmul_faster_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant2matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}

void vecquant3matmul_faster_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
);

void vecquant3matmul_faster_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant3matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}

void vecquant4matmul_faster_cuda_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
);

void vecquant4matmul_faster_old(
  torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
  torch::Tensor scales, torch::Tensor zeros,
  int groupsize, int vec_height
) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant4matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}


PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
  m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");

  m.def("vecquant2matmul_old", &vecquant2matmul_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
  m.def("vecquant3matmul_old", &vecquant3matmul_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
  m.def("vecquant4matmul_old", &vecquant4matmul_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
  m.def("vecquant8matmul_old", &vecquant8matmul_old, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
  m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA), faster version");
  m.def("vecquant3matmul_faster_old", &vecquant3matmul_faster_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA), faster version");
  m.def("vecquant4matmul_faster_old", &vecquant4matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
}
@@ -7,29 +7,66 @@
 // atomicAdd for double-precision floating-point numbers on hardware with
 // compute capability < 6.0 from:
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
-__device__ double atomicAdd(
-    double* address,
-    double val
-) {
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed;
-
-  do {
-    assumed = old;
-    old = atomicCAS(
-      address_as_ull,
-      assumed,
-      __double_as_longlong(val + __longlong_as_double(assumed))
-    );
-
-    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
+// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+// __device__ double atomicAdd(
+//     double* address,
+//     double val
+// ) {
+//   unsigned long long int* address_as_ull = (unsigned long long int*)address;
+//   unsigned long long int old = *address_as_ull, assumed;
+//
+//   do {
+//     assumed = old;
+//     old = atomicCAS(
+//       address_as_ull,
+//       assumed,
+//       __double_as_longlong(val + __longlong_as_double(assumed))
+//     );
+//
+//     // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+//   } while (assumed != old);
+//
+//   return __longlong_as_double(old);
+// }
+// #endif
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
+// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
+__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
+    unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do {
+        assumed = old;
+        unsigned short hsum = reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
+        hsum += val;
+        old = reinterpret_cast<size_t>(address) & 2
+                 ? (old & 0xffff) | (hsum << 16)
+                 : (old & 0xffff0000) | hsum;
+        old = atomicCAS(address_as_ui, assumed, old);
+
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+}
+__device__ __forceinline__ void atomicAdd(__half* address, c10::Half val) {
+    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do {
+        assumed = old;
+        __half_raw hsum;
+        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+        half tmpres = __hadd(hsum, val);
+        hsum = __half_raw(tmpres);
+        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+        old = atomicCAS(address_as_ui, assumed, old);
+    } while (assumed != old);
+}
 #endif
 
 template <typename scalar_t>
 __global__ void VecQuant2MatMulKernel(
     const scalar_t* __restrict__ vec,

@@ -69,7 +106,7 @@ __global__ void VecQuant4MatMulKernel(
     const int* __restrict__ zeros,
     const int* __restrict__ g_idx,
     int batch,
     int vec_height,
     int height,
     int width,
     int zero_width
autogptq_cuda/autogptq_cuda_kernel_64.cu (new file, 1428 lines)
File diff suppressed because it is too large.
@@ -1,4 +1,6 @@
 ## <center>News or Update</center>
+- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
+- 2023-06-05 - (Update) - Integrate with 🤗 peft to use gptq quantized model to train adapters, support LoRA, AdaLoRA, AdaptionPrompt, etc.
 - 2023-05-30 - (Update) - support download/upload quantized model from/to 🤗 Hub.
 - 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefineWeb/RefineWebModel`(falcon) model types.
 - 2023-05-04 - (Update) - Support using faster cuda kernel when `not desc_act or group_size == -1`
@@ -13,9 +13,9 @@ python basic_usage.py
 
 This script also showcases how to download/upload quantized model from/to 🤗 Hub, to enable those features, you can uncomment the commented codes.
 
-To Execute `basic_usage_with_wikitext2.py`, using command like this:
+To Execute `basic_usage_wikitext2.py`, using command like this:
 ```shell
-python basic_usage_with_wikitext2.py
+python basic_usage_wikitext2.py
 ```
 > Note: There is about 0.6 ppl degrade on opt-125m model using AutoGPTQ, compared to GPTQ-for-LLaMa.
 
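For the Hub support mentioned just above, a minimal download-side sketch; the repo id and basename are placeholders borrowed from the perplexity example later in this diff, not a recommendation:

```python
# Minimal sketch: load a GPTQ-quantized checkpoint directly from the 🤗 Hub.
# Repo id and basename are placeholders taken from the perplexity example in this
# same commit; substitute your own quantized repo.
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

repo_id = "TheBloke/open-llama-7b-open-instruct-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoGPTQForCausalLM.from_quantized(
    repo_id,
    model_basename="gptq_model-4bit-128g",
    use_safetensors=True,  # set according to how the checkpoint was saved
    device_map="auto",
)
```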
@@ -66,11 +66,48 @@ Use `--help` flag to see detailed descriptions for more command arguments.
 > Commands in this chapter should be run under `benchmark` folder.
 
 ### Generation Speed
-`generation_speed.py` scripts gives an example of how to benchmark the generations speed of pretrained and quantized models that `auto_gptq` supports, this benchmarks model generation speed in tokens/s metric.
+`generation_speed.py` script gives an example of how to benchmark the generations speed of pretrained and quantized models that `auto_gptq` supports, this benchmarks model generation speed in tokens/s metric.
 
-To eexcute this script, using command like this:
+To execute this script, using command like this:
 ```shell
 CUDA_VISIBLE_DEVICES=0 python generation_speed.py --model_name_pr_path PATH/TO/MODEL/DIR
 ```
 
 Use `--help` flag to see detailed descriptions for more command arguments.
+
+## PEFT
+> Commands in this chapter should be run under `peft` folder.
+
+### Lora
+`peft_lora_clm_instruction_tuning.py` script gives an example of instruction tuning gptq quantized model's lora adapter using tools in `auto_gptq.utils.peft_utils` and `🤗 peft` on alpaca dataset.
+
+To execute this script, using command like this:
+```shell
+CUDA_VISIBLE_DEVICES=0 python peft_lora_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
+```
+
+Use `--help` flag to see detailed descriptions for more command arguments.
+
+### AdaLora
+`peft_adalora_clm_instruction_tuning.py` script gives an example of instruction tuning gptq quantized model's adalora adapter using tools in `auto_gptq.utils.peft_utils` and `🤗 peft` on alpaca dataset.
+
+To execute this script, using command like this:
+```shell
+CUDA_VISIBLE_DEVICES=0 python peft_adalora_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
+```
+
+Use `--help` flag to see detailed descriptions for more command arguments.
+
+
+### AdaptionPrompt
+`peft_adaption_prompt_clm_instruction_tuning.py` script gives an example of instruction tuning gptq quantized model's adaption_prompt adapter(llama-adapter) using tools in `auto_gptq.utils.peft_utils` and `🤗 peft` on alpaca dataset.
+
+To execute this script, using command like this:
+```shell
+CUDA_VISIBLE_DEVICES=0 python peft_adaption_prompt_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
+```
+
+Use `--help` flag to see detailed descriptions for more command arguments.
+
+If you want to try models other than llama, you can install peft from source using [this branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt), see [here](https://github.com/PanQiWei/peft/blob/a5f8f74f07591efe5eb3d08cb1b31b981e84a069/src/peft/tuners/adaption_prompt.py#L235)
+to check what other models are also supported, and with this branch installed, you can also use `ADAPTION_PROMPT_V2` peft type (llama-adapter-v2) by simply replace `AdaptionPromptConfig` with `AdaptionPromptV2Config` in the script.
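For readers who only want the gist of the tokens/s number the section above reports, a rough, self-contained sketch of the measurement (model name and prompt are placeholders; this is not the exact accounting `generation_speed.py` does):

```python
# Rough tokens/s measurement: newly generated tokens divided by wall-clock generation time.
# Placeholder model and prompt; a real benchmark should also warm up and average several runs.
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model.eval()

inputs = tokenizer("The quick brown fox", return_tensors="pt")
start = time.time()
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)
elapsed = time.time() - start

new_tokens = output_ids.shape[1] - inputs["input_ids"].shape[1]
print(f"{new_tokens / elapsed:.2f} tokens/s")
```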
File diff suppressed because it is too large.
@@ -144,7 +144,9 @@ def load_model_tokenizer(
     trust_remote_code: bool = False,
     use_triton: bool = False,
     use_safetensors: bool = False,
-    use_fast_tokenizer: bool = False
+    use_fast_tokenizer: bool = False,
+    inject_fused_attention: bool = True,
+    inject_fused_mlp: bool = True
 ):
     tokenizer = AutoTokenizer.from_pretrained(
         pretrained_model_name_or_path=tokenizer_name_or_path or model_name_or_path,

@@ -163,12 +165,12 @@ def load_model_tokenizer(
         )
     else:
         model = AutoGPTQForCausalLM.from_quantized(
-            save_dir=model_name_or_path,
+            model_name_or_path,
             max_memory=max_memory,
             low_cpu_mem_usage=True,
             use_triton=use_triton,
-            inject_fused_attention=True,
-            inject_fused_mlp=True,
+            inject_fused_attention=inject_fused_attention,
+            inject_fused_mlp=inject_fused_mlp,
             use_cuda_fp16=True,
             quantize_config=quantize_config,
             model_basename=model_basename,

@@ -232,6 +234,8 @@ def main():
     parser.add_argument("--use_triton", action="store_true")
     parser.add_argument("--use_safetensors", action="store_true")
     parser.add_argument("--use_fast_tokenizer", action="store_true")
+    parser.add_argument("--no_inject_fused_attention", action="store_true")
+    parser.add_argument("--no_inject_fused_mlp", action="store_true")
     parser.add_argument("--num_samples", type=int, default=10)
     parser.add_argument("--per_gpu_max_memory", type=int, default=None)
     parser.add_argument("--cpu_max_memory", type=int, default=None)

@@ -269,7 +273,9 @@ def main():
         trust_remote_code=args.trust_remote_code,
         use_triton=args.use_triton,
         use_safetensors=args.use_safetensors,
-        use_fast_tokenizer=args.use_fast_tokenizer
+        use_fast_tokenizer=args.use_fast_tokenizer,
+        inject_fused_attention=not args.no_inject_fused_attention,
+        inject_fused_mlp=not args.no_inject_fused_mlp
     )
     end = time.time()
     logger.info(f"model and tokenizer loading time: {end - start:.4f}s")

@@ -282,7 +288,9 @@ def main():
         model.warmup_triton()
 
     logger.info("loading data")
-    examples = load_data("dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples, args.max_new_tokens)
+    examples = load_data(
+        "../quantization/dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples, args.max_new_tokens
+    )
 
     generation_config = GenerationConfig(
         num_beams=args.num_beams,
examples/benchmark/perplexity.py (new file, 86 lines)
@@ -0,0 +1,86 @@
import os
import argparse

import torch
from auto_gptq.utils import Perplexity
from transformers import AutoTokenizer

if __name__ == "__main__":
    """
    Example usage.

    Default usage with GPT2 model:
    python examples/benchmark/perplexity.py

    Specify GPTQ quantized model:
    python examples/benchmark/perplexity.py \
        --model_name TheBloke/open-llama-7b-open-instruct-GPTQ \
        --model_basename gptq_model-4bit-128g \
        --is_quantized

    Change your dataset:
    python examples/benchmark/perplexity.py --dataset_path tiny_shakespeare

    """
    parser = argparse.ArgumentParser(description="Calculate Perplexity for a model.")
    parser.add_argument("--model_name", type=str, default='gpt2', help="Model name.")
    parser.add_argument("--model_basename", type=str, default=None, help="Model file's basename.")
    parser.add_argument("--n_ctx", type=int, default=512, help="Context size.")
    parser.add_argument("--n_batch", type=int, default=512, help="Batch size.")
    parser.add_argument("--dataset_path", type=str, default='wikitext', help="Path to the dataset.")
    parser.add_argument("--dataset_name", type=str, default=None, help="Name of the dataset.")
    parser.add_argument("--split", type=str, default='test', help="Dataset split to use.")
    parser.add_argument("--text_column", type=str, default='text', help="Column in the dataset containing the text.")
    parser.add_argument("--per_gpu_max_memory", type=int, default=None, help="Max memory used in each GPU.")
    parser.add_argument("--cpu_max_memory", type=int, default=None, help="Max memory used in CPU.")
    parser.add_argument("--is_quantized", action="store_true", help="Is the model GPTQ quantized?")
    parser.add_argument("--use_safetensors", action="store_true", help="Whether to use safetensors model file")
    parser.add_argument("--use_fast_tokenizer", action="store_true", help="Whether to use fast tokenizer")
    parser.add_argument("--trust_remote_code", action="store_true", help="Whether to use remote code")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=args.use_fast_tokenizer)
    if not tokenizer.pad_token_id:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    max_memory = dict()
    if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
        if torch.cuda.is_available():
            max_memory.update(
                {i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())}
            )
    if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
        max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
    if not max_memory:
        max_memory = None

    if args.is_quantized:
        from auto_gptq import AutoGPTQForCausalLM

        model = AutoGPTQForCausalLM.from_quantized(
            args.model_name,
            low_cpu_mem_usage=True,
            device_map="auto",
            max_memory=max_memory,
            model_basename=args.model_basename,
            use_safetensors=args.use_safetensors,
            trust_remote_code=args.trust_remote_code,
            inject_fused_mlp=False,
            inject_fused_attention=False
        )
    else:
        from transformers import AutoModelForCausalLM

        model = AutoModelForCausalLM.from_pretrained(
            args.model_name,
            low_cpu_mem_usage=True,
            device_map="auto",
            max_memory=max_memory,
            torch_dtype=torch.float16,
            trust_remote_code=args.trust_remote_code
        )

    ppl = Perplexity(model, tokenizer, args.dataset_path, args.dataset_name, args.split, args.text_column)
    ppl.calculate_perplexity(args.n_ctx, args.n_batch)
examples/peft/peft_adalora_clm_instruction_tuning.py (new file, 169 lines)
@@ -0,0 +1,169 @@
import json
import os
from argparse import ArgumentParser
from functools import partial

import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup

from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from auto_gptq.utils.peft_utils import GPTQAdaLoraConfig
from peft import TaskType

parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--lr", type=float, default=3e-3)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path

lr = args.lr
num_epochs = args.num_epochs

# creating model
peft_config = GPTQAdaLoraConfig(
    init_r=20,
    target_r=16,
    beta1=0.85,
    beta2=0.85,
    tinit=200,
    tfinal=1000,
    deltaT=10,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    use_triton=True,
    warmup_triton=False,
    trainable=True,
    inject_fused_attention=True,
    inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()

# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"


def ds_refactor_fn(samples):
    instruction_data = samples["instruction"]
    input_data = samples["input"]
    output_data = samples["output"]

    new_samples = {"prompt": [], "output": []}
    for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
        if input_txt:
            prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
        else:
            prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
        new_samples["prompt"].append(prompt)
        new_samples["output"].append(output_txt)

    return new_samples


ds = Dataset.from_generator(
    lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
    make_data_block,
    batched=True,
    batch_size=len(ds),
    num_proc=1,
    remove_columns=ds.column_names,
    keep_in_memory=True,
    load_from_cache_file=False,
    fn_kwargs={
        "prompt_col_name": "prompt",
        "label_col_name": "output",
        "tokenizer": tokenizer,
        "preprocess_fn": ds_refactor_fn,
        "sample_max_len": args.sample_max_length,
        "block_max_len": args.block_max_length,
        "add_eos_token": True,
        "truncate_prompt": False,
        "merge_prompt_label": True
    }
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)

# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)
model.base_model.peft_config["default"].total_step = len(train_dataloader) * num_epochs

# training and evaluation
with torch.cuda.amp.autocast():
    global_step = 0
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader)
        for step, batch in enumerate(progress_bar):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # Update the importance of low-rank matrices
            # and allocate the budget accordingly.
            model.base_model.update_and_allocate(global_step)
            optimizer.zero_grad()
            global_step += 1

            progress_bar.set_postfix(loss=loss.item())

        model.eval()
        eval_loss = 0
        eval_preds = []
        for step, batch in enumerate(tqdm(eval_dataloader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            eval_preds.extend(
                tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
            )

        eval_epoch_loss = eval_loss / len(eval_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))
examples/peft/peft_adaption_prompt_clm_instruction_tuning.py (new file, 158 lines)
@@ -0,0 +1,158 @@
import json
import os
from argparse import ArgumentParser
from functools import partial

import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup

from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from peft import TaskType, AdaptionPromptConfig

parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--adapter_len", type=int, default=10)
parser.add_argument("--adapter_layers", type=int, default=30)
parser.add_argument("--lr", type=float, default=3e-3)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path

lr = args.lr
num_epochs = args.num_epochs

# creating model
peft_config = AdaptionPromptConfig(
    adapter_len=args.adapter_len,
    adapter_layers=args.adapter_layers,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    use_triton=True,
    warmup_triton=False,
    trainable=True,
    inject_fused_attention=False,
    inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()

# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"


def ds_refactor_fn(samples):
    instruction_data = samples["instruction"]
    input_data = samples["input"]
    output_data = samples["output"]

    new_samples = {"prompt": [], "output": []}
    for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
        if input_txt:
            prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
        else:
            prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
        new_samples["prompt"].append(prompt)
        new_samples["output"].append(output_txt)

    return new_samples


ds = Dataset.from_generator(
    lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
    make_data_block,
    batched=True,
    batch_size=len(ds),
    num_proc=1,
    remove_columns=ds.column_names,
    keep_in_memory=True,
    load_from_cache_file=False,
    fn_kwargs={
        "prompt_col_name": "prompt",
        "label_col_name": "output",
        "tokenizer": tokenizer,
        "preprocess_fn": ds_refactor_fn,
        "sample_max_len": args.sample_max_length,
        "block_max_len": args.block_max_length,
        "add_eos_token": True,
        "truncate_prompt": False,
        "merge_prompt_label": True
    }
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)

# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

# training and evaluation
with torch.cuda.amp.autocast():
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader)
        for step, batch in enumerate(progress_bar):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            optimizer.zero_grad()

            progress_bar.set_postfix(loss=loss.item())

        model.eval()
        eval_loss = 0
        eval_preds = []
        for step, batch in enumerate(tqdm(eval_dataloader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            eval_preds.extend(
                tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
            )

        eval_epoch_loss = eval_loss / len(eval_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))
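As a reminder of the README note earlier in this diff: llama-adapter-v2 is only available from the linked peft branch, and that note implies the v2 config is a drop-in swap. Under that assumption (the class and its fields are not part of mainline peft or of this commit), the change inside the script above would look roughly like:

```python
# Hypothetical drop-in swap described in the README hunk above. AdaptionPromptV2Config
# comes from the multi_modal_adaption_prompt branch of peft linked there, and we assume
# it accepts the same fields as AdaptionPromptConfig; `args` is the script's own namespace.
from peft import TaskType, AdaptionPromptV2Config

peft_config = AdaptionPromptV2Config(
    adapter_len=args.adapter_len,
    adapter_layers=args.adapter_layers,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)
```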
examples/peft/peft_lora_clm_instruction_tuning.py (new file, 158 lines)
@@ -0,0 +1,158 @@
import json
import os
from argparse import ArgumentParser
from functools import partial

import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup

from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from auto_gptq.utils.peft_utils import GPTQLoraConfig
from peft import TaskType

parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--lr", type=float, default=3e-5)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path

lr = args.lr
num_epochs = args.num_epochs

# creating model
peft_config = GPTQLoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    use_triton=True,
    warmup_triton=False,
    trainable=True,
    inject_fused_attention=True,
    inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()

# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"


def ds_refactor_fn(samples):
    instruction_data = samples["instruction"]
    input_data = samples["input"]
    output_data = samples["output"]

    new_samples = {"prompt": [], "output": []}
    for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
        if input_txt:
            prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
        else:
            prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
        new_samples["prompt"].append(prompt)
        new_samples["output"].append(output_txt)

    return new_samples


ds = Dataset.from_generator(
    lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
    make_data_block,
    batched=True,
    batch_size=len(ds),
    num_proc=1,
    remove_columns=ds.column_names,
    keep_in_memory=True,
    load_from_cache_file=False,
    fn_kwargs={
        "prompt_col_name": "prompt",
        "label_col_name": "output",
        "tokenizer": tokenizer,
        "preprocess_fn": ds_refactor_fn,
        "sample_max_len": args.sample_max_length,
        "block_max_len": args.block_max_length,
        "add_eos_token": True,
        "truncate_prompt": False,
        "merge_prompt_label": True
    }
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)

# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

# training and evaluation
with torch.cuda.amp.autocast():
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader)
        for step, batch in enumerate(progress_bar):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            optimizer.zero_grad()

            progress_bar.set_postfix(loss=loss.item())

        model.eval()
        eval_loss = 0
        eval_preds = []
        for step, batch in enumerate(tqdm(eval_dataloader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            eval_preds.extend(
                tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
            )

        eval_epoch_loss = eval_loss / len(eval_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))
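A small follow-up the three training scripts stop short of: once training finishes, the adapter-wrapped model can be smoke-tested with ordinary generation. This is a sketch under the assumption that the peft-wrapped AutoGPTQ model exposes the standard `generate()` interface; it reuses `tokenizer`, `model`, `device` and the prompt template from the script above and is not part of this commit.

```python
# Quick post-training smoke test: greedy-generate from the tuned adapter.
# Assumes the usual transformers/peft generate() interface; not part of this commit.
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction="Name three primary colors.")
inputs = tokenizer(prompt, return_tensors="pt").to(device)

model.eval()
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```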
setup.py (108 changed lines)
@@ -4,29 +4,30 @@ import sys
 from pathlib import Path
 from setuptools import setup, find_packages
 
-try:
-    import torch
-    TORCH_AVAILABLE = True
-except ImportError:
-    TORCH_AVAILABLE = False
-
-IN_GITHUB_ACTIONS = os.environ.get("GITHUB_ACTIONS", "false") == "true"
-
 python_min_version = (3, 8, 0)
 python_min_version_str = '.'.join(map(str, python_min_version))
 if sys.version_info < python_min_version:
     print(f"You are using Python {platform.python_version()}. Python >={python_min_version_str} is required.")
     sys.exit(-1)
 
-CUDA_VERSION = "".join(os.environ.get("CUDA_VERSION", "").split("."))
+BUILD_CUDA_EXT = int(os.environ.get('BUILD_CUDA_EXT', '1')) == 1
+
+if BUILD_CUDA_EXT:
+    try:
+        import torch
+    except:
+        print("torch is not installed, please install torch first!")
+        sys.exit(-1)
+    CUDA_VERSION = "".join(torch.version.cuda.split("."))
+else:
+    CUDA_VERSION = "".join(os.environ.get("CUDA_VERSION", "").split("."))
 
-version = "0.2.1" + (f"+cu{CUDA_VERSION}" if CUDA_VERSION and IN_GITHUB_ACTIONS else "")
 common_setup_kwargs = {
-    "version": version,
+    "version": "0.3.2",
     "name": "auto_gptq",
     "author": "PanQiWei",
     "description": "An easy-to-use LLMs quantization package with user-friendly apis, based on GPTQ algorithm.",
-    "long_description": (Path(__file__).parent / "README.md").read_text(),
+    "long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"),
     "long_description_content_type": "text/markdown",
     "url": "https://github.com/PanQiWei/AutoGPTQ",
     "keywords": ["gptq", "quantization", "large-language-models", "pytorch", "transformers"],

@@ -45,6 +46,9 @@ common_setup_kwargs = {
     "python_requires": f">={python_min_version_str}"
 }
 
+if CUDA_VERSION:
+    common_setup_kwargs['version'] += f"+cu{CUDA_VERSION}"
+
 requirements = [
     "accelerate>=0.19.0",
     "datasets",

@@ -52,54 +56,50 @@ requirements = [
     "rouge",
     "torch>=1.13.0",
     "safetensors",
-    "transformers>=4.26.1"
+    "transformers>=4.31.0",
+    "peft"
 ]
 
 extras_require = {
-    "llama": ["transformers>=4.28.0"],
     "triton": ["triton>=2.0.0"]
 }
 
 include_dirs = ["autogptq_cuda"]
 
-if TORCH_AVAILABLE:
-    BUILD_CUDA_EXT = int(os.environ.get('BUILD_CUDA_EXT', '1')) == 1
-
-    additional_setup_kwargs = dict()
-    if BUILD_CUDA_EXT and (torch.cuda.is_available() or IN_GITHUB_ACTIONS):
-        from torch.utils import cpp_extension
-        from distutils.sysconfig import get_python_lib
-        conda_cuda_include_dir=os.path.join(get_python_lib(),"nvidia/cuda_runtime/include")
-        if os.path.isdir(conda_cuda_include_dir):
-            include_dirs.append(conda_cuda_include_dir)
-            print(f"appending conda cuda include dir {conda_cuda_include_dir}")
-        extensions = [
-            cpp_extension.CUDAExtension(
-                "autogptq_cuda",
-                [
-                    "autogptq_cuda/autogptq_cuda.cpp",
-                    "autogptq_cuda/autogptq_cuda_kernel.cu"
-                ]
-            )
-        ]
-
-        additional_setup_kwargs = {
-            "ext_modules": extensions,
-            "cmdclass": {'build_ext': cpp_extension.BuildExtension}
-        }
-    common_setup_kwargs.update(additional_setup_kwargs)
-    setup(
-        packages=find_packages(),
-        install_requires=requirements,
-        extras_require=extras_require,
-        include_dirs=include_dirs,
-        **common_setup_kwargs
-    )
-else:
-    setup(
-        packages=find_packages(),
-        install_requires=requirements,
-        extras_require=extras_require,
-        include_dirs=include_dirs,
-        **common_setup_kwargs
-    )
+additional_setup_kwargs = dict()
+if BUILD_CUDA_EXT:
+    from torch.utils import cpp_extension
+    from distutils.sysconfig import get_python_lib
+    conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
+    if os.path.isdir(conda_cuda_include_dir):
+        include_dirs.append(conda_cuda_include_dir)
+        print(f"appending conda cuda include dir {conda_cuda_include_dir}")
+    extensions = [
+        cpp_extension.CUDAExtension(
+            "autogptq_cuda_64",
+            [
+                "autogptq_cuda/autogptq_cuda_64.cpp",
+                "autogptq_cuda/autogptq_cuda_kernel_64.cu"
+            ]
+        ),
+        cpp_extension.CUDAExtension(
+            "autogptq_cuda_256",
+            [
+                "autogptq_cuda/autogptq_cuda_256.cpp",
+                "autogptq_cuda/autogptq_cuda_kernel_256.cu"
+            ]
+        )
+    ]
+
+    additional_setup_kwargs = {
+        "ext_modules": extensions,
+        "cmdclass": {'build_ext': cpp_extension.BuildExtension}
+    }
+common_setup_kwargs.update(additional_setup_kwargs)
+setup(
+    packages=find_packages(),
+    install_requires=requirements,
+    extras_require=extras_require,
+    include_dirs=include_dirs,
+    **common_setup_kwargs
+)
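One practical consequence of the setup.py change above, with the caveat that it depends on how the build is invoked: `BUILD_CUDA_EXT` is now read from the environment at build time, so something like `BUILD_CUDA_EXT=0 pip install .` should skip compiling the CUDA extensions entirely, while the default (`1`) requires torch to be importable so that `torch.version.cuda` can supply the `+cuXXX` suffix appended to the package version.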