Merge branch 'main' into MPT

# Conflicts:
#	auto_gptq/modeling/__init__.py
#	auto_gptq/modeling/_const.py
#	auto_gptq/modeling/auto.py
LaaZa 2023-07-26 20:41:19 +03:00
commit 6ff6bc8dfc
37 changed files with 3503 additions and 258905 deletions

View file

@ -1,4 +1,4 @@
name: Build AutoGPTQ Wheels
name: Build AutoGPTQ Wheels with CUDA
on: workflow_dispatch
@ -51,7 +51,7 @@ jobs:
if ($IsLinux) {$env:LD_LIBRARY_PATH = $env:CONDA_PREFIX + '/lib:' + $env:LD_LIBRARY_PATH}
$env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6+PTX'
if ([decimal]$env:CUDA_VERSION -ge 11.8) { $env:TORCH_CUDA_ARCH_LIST = '6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX' }
python -m build -n
python setup.py sdist bdist_wheel
- uses: actions/upload-artifact@v3
if: runner.os == 'Linux'
@ -64,37 +64,3 @@ jobs:
with:
name: 'windows-wheels'
path: ./dist/*.whl
build_sdist:
name: Build source distribution
runs-on: ubuntu-latest
defaults:
run:
shell: pwsh
steps:
- uses: actions/checkout@v3
with:
ref: 'main'
- uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Install Dependencies
run: |
python -m pip install --upgrade build setuptools wheel
- name: Build Wheel
run: |
python -m build -n
- uses: actions/upload-artifact@v3
with:
name: 'sdist'
path: ./dist/*.tar.gz
- uses: actions/upload-artifact@v3
with:
name: 'no-cuda-wheel'
path: ./dist/*.whl

View file

@ -12,14 +12,15 @@
<p>
<b>English</b> |
<a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README_zh.md">中文</a>
<p>
</p>
</h4>
*<center>📣 Long time no see! 👋 Architecture upgrades, performance optimizations and more new features are coming in July and August, stay tuned! 🥂</center>*
## News or Update
**To get an early preview of adapter training with `auto_gptq` quantized models, you can try [this branch](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration) and join the discussion [here](https://github.com/PanQiWei/AutoGPTQ/issues/103); examples are [here](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration/examples/peft).**
- 2023-05-25 - (In Progress) - Integrate with 🤗 peft to use gptq quantized models to train adapters; supports LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
- 2023-06-05 - (Update) - Integrate with 🤗 peft to use gptq quantized models to train adapters; supports LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-05-30 - (Update) - Support downloading/uploading quantized models from/to the 🤗 Hub.
- 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefinedWeb/RefinedWebModel` (falcon) model types.
- 2023-05-04 - (Update) - Support using faster cuda kernels when `not desc_act or group_size == -1`.
@ -69,11 +70,7 @@ And to make sure `autogptq_cuda` is not ever in your virtual environment, run:
```shell
pip uninstall autogptq_cuda -y
```
#### to support LLaMa model
For those who want to try LLaMa but whose `transformers` version does not yet support it, use:
```shell
pip install auto-gptq[llama]
```
#### to support triton speedup
To integrate with `triton`, use:
> warning: currently triton only supports linux; 3-bit quantization is not supported when using triton
@ -96,8 +93,6 @@ pip install .
```
Like quick installation, you can also set `BUILD_CUDA_EXT=0` to disable pytorch extension building.
Use `.[llama]` if you want to try the LLaMa model.
Use `.[triton]` if you want to integrate with triton and it's available on your operating system.
</details>
@ -304,18 +299,18 @@ print(
>
> for example, the model_type of `WizardLM`, `vicuna` and `gpt4all` is `llama`, hence they are all supported by `auto_gptq`.
| model type | quantization | inference | peft-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|----------------------|
| bloom | ✅ | ✅ | | |
| gpt2 | ✅ | ✅ | | |
| gpt_neox | ✅ | ✅ | | |
| gptj | ✅ | ✅ | | |
| llama | ✅ | ✅ | | ✅ |
| moss | ✅ | ✅ | | |
| opt | ✅ | ✅ | | |
| gpt_bigcode | ✅ | ✅ | | |
| codegen | ✅ | ✅ | | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | | |
| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|---------------|-------------------------------------------------------------------------------------------------|
| bloom | ✅ | ✅ | ✅ | ✅ | |
| gpt2 | ✅ | ✅ | ✅ | ✅ | |
| gpt_neox | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| gptj | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| llama | ✅ | ✅ | ✅ | ✅ | ✅ |
| moss | ✅ | ✅ | ✅ | ✅ | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| opt | ✅ | ✅ | ✅ | ✅ | |
| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
| codegen | ✅ | ✅ | ✅ | ✅ | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
## Supported Evaluation Tasks
Currently, `auto_gptq` supports: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more Tasks will come soon!
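The table above pairs naturally with a concrete workflow. Below is a minimal, hedged sketch of quantizing and reloading a model with the APIs touched in this commit; the model name, calibration text and output directory are placeholders, and the `examples` format follows the project's existing usage examples rather than anything introduced here.
```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

pretrained_model_name = "facebook/opt-125m"  # placeholder: any causal LM of a supported model type
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
# a single calibration example; real quantization should use many more
examples = [tokenizer("auto-gptq is an easy-to-use model quantization library.", return_tensors="pt")]

quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_name, quantize_config)
model.quantize(examples)
model.save_quantized("opt-125m-4bit", use_safetensors=True)

# reload the quantized weights for inference
model = AutoGPTQForCausalLM.from_quantized("opt-125m-4bit", device="cuda:0", use_safetensors=True)
inputs = tokenizer("auto-gptq is", return_tensors="pt").to("cuda:0")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```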

View file

@ -12,14 +12,15 @@
<p>
<a href="https://github.com/PanQiWei/AutoGPTQ/blob/main/README.md">English</a> |
<b>中文</b>
<p>
</p>
</h4>
*<center>📣 Long time no see! 👋 Architecture upgrades, performance optimizations and new features are coming in July and August, stay tuned! 🥂</center>*
## News or Update
**To get an early preview of adapter training with `auto_gptq` quantized models, you can try [this branch](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration) and join the discussion [here](https://github.com/PanQiWei/AutoGPTQ/issues/103); example scripts are provided [here](https://github.com/PanQiWei/AutoGPTQ/tree/peft_integration/examples/peft).**
- 2023-05-25 - (In Progress) - Integrate with 🤗 peft to train adapters on gptq quantized models; supports LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
- 2023-06-05 - (Update) - Integrate with 🤗 peft to train adapters on gptq quantized models; supports LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-05-30 - (Update) - Support downloading quantized models from, and uploading quantized models to, the 🤗 Hub.
- 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefinedWeb/RefinedWebModel` (falcon) model types.
- 2023-05-04 - (Update) - Support using faster cuda kernels when `not desc_act or group_size == -1`.
@ -69,11 +70,7 @@ BUILD_CUDA_EXT=0 pip install auto-gptq
```shell
pip uninstall autogptq_cuda -y
```
#### to support LLaMa model
For those who want to try LLaMa but whose `transformers` version does not yet support it, use:
```shell
pip install auto-gptq[llama]
```
#### to support triton speedup
To use `triton` to speed up model inference, use:
> warning: currently triton only supports linux; 3-bit quantization is not supported when using triton
@ -96,8 +93,6 @@ pip install .
```
As in the quick installation section, you can also set `BUILD_CUDA_EXT=0` to disable building the cuda extension.
Use `.[llama]` if you want to try the LLaMa model.
Use `.[triton]` if you want to integrate with triton and it's available on your operating system.
</details>
@ -303,18 +298,18 @@ print(
>
> for example, the model_type of `WizardLM`, `vicuna` and `gpt4all` is `llama`, hence they are all supported by `auto_gptq`.
| model type | quantization | inference | peft-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|----------------------|
| bloom | ✅ | ✅ | | |
| gpt2 | ✅ | ✅ | | |
| gpt_neox | ✅ | ✅ | | |
| gptj | ✅ | ✅ | | |
| llama | ✅ | ✅ | | ✅ |
| moss | ✅ | ✅ | | |
| opt | ✅ | ✅ | | |
| gpt_bigcode | ✅ | ✅ | | |
| codegen | ✅ | ✅ | | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | | |
| model type | quantization | inference | peft-lora | peft-ada-lora | peft-adaption_prompt |
|------------------------------------|--------------|-----------|-----------|---------------|-----------------------------------------------------------------------------------|
| bloom | ✅ | ✅ | ✅ | ✅ | |
| gpt2 | ✅ | ✅ | ✅ | ✅ | |
| gpt_neox                           | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| gptj                               | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| llama                              | ✅            | ✅         | ✅         | ✅             | ✅                                                                                    |
| moss                               | ✅            | ✅         | ✅         | ✅             | ✅[requires this peft branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt) |
| opt | ✅ | ✅ | ✅ | ✅ | |
| gpt_bigcode | ✅ | ✅ | ✅ | ✅ | |
| codegen | ✅ | ✅ | ✅ | ✅ | |
| falcon(RefinedWebModel/RefinedWeb) | ✅ | ✅ | ✅ | ✅ | |
## Supported Evaluation Tasks
Currently, `auto_gptq` supports: `LanguageModelingTask`, `SequenceClassificationTask` and `TextSummarizationTask`; more Tasks will come soon!

View file

@ -1,2 +1,4 @@
__version__ = "0.3.2"
from .modeling import BaseQuantizeConfig
from .modeling import AutoGPTQForCausalLM
from .utils.peft_utils import get_gptq_peft_model
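The package now exports `AutoGPTQForCausalLM` and `get_gptq_peft_model` at the top level. A hedged sketch of how these exports fit together for adapter training follows; the repo id and LoRA hyper-parameters are placeholders, not values prescribed by this commit.
```python
from peft import LoraConfig, TaskType
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model

model = AutoGPTQForCausalLM.from_quantized(
    "someuser/some-gptq-model",    # placeholder: any already-quantized checkpoint
    use_triton=True,               # trainable mode currently requires the triton backend
    trainable=True,
    inject_fused_attention=False,  # fused attention/mlp injection is typically disabled when training adapters
    inject_fused_mlp=False,
)
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=32, lora_dropout=0.05)
model = get_gptq_peft_model(model, peft_config=peft_config, train_mode=True, auto_find_all_linears=True)
model.print_trainable_parameters()
```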

View file

@ -10,4 +10,6 @@ from .opt import *
from .rw import *
from .gpt_bigcode import *
from .codegen import *
from .baichuan import *
from .internlm import *
from .mpt import *

View file

@ -20,10 +20,11 @@ from transformers.modeling_utils import no_init_weights
from ._const import *
from ._utils import *
from ..nn_modules.qlinear import GeneralQuantLinear
from ..nn_modules._fused_base import FusedBaseAttentionModule, FusedBaseMLPModule
from ..quantization import GPTQ
from ..utils.data_utils import collate_data
from ..utils.import_utils import TRITON_AVAILABLE
from ..utils.import_utils import dynamically_import_QuantLinear, TRITON_AVAILABLE, AUTOGPTQ_CUDA_AVAILABLE
logger = getLogger(__name__)
@ -112,7 +113,16 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
fused_attn_module_type: Optional[FusedBaseAttentionModule] = None
fused_mlp_module_type: Optional[FusedBaseMLPModule] = None
def __init__(self, model: PreTrainedModel, quantized: bool, quantize_config: BaseQuantizeConfig):
def __init__(
self,
model: PreTrainedModel,
quantized: bool,
quantize_config: BaseQuantizeConfig,
is_triton_backend: bool = False,
injected_fused_attention: bool = False,
injected_fused_mlp: bool = False,
trainable: bool = False
):
super().__init__()
self.model = model
@ -121,6 +131,11 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
self.quantize_config = quantize_config
self.config = self.model.config
self.is_triton_backend = is_triton_backend
self.injected_fused_attention = injected_fused_attention
self.injected_fused_mlp = injected_fused_mlp
self.trainable = trainable
@property
def quantized(self):
return self._quantized
@ -431,6 +446,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
repo_id: str,
save_dir: Optional[str] = None,
use_safetensors: Optional[bool] = True,
safetensors_metadata: Optional[Dict[str, str]] = None,
commit_message: Optional[str] = "Upload of AutoGPTQ quantized model",
use_auth_token: Optional[Union[bool, str]] = None,
private: Optional[bool] = None,
@ -450,6 +466,10 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
use_safetensors (`bool`, *optional*):
Save the model using `safetensors`.
If the model has already been saved, this parameter can be omitted.
safetensors_metadata: (`dict`, *optional*, defaults to `None`):
Pass optional metadata dictionary to be saved in the `safetensors` model file(s).
Metadata is optional and is purely for informational purposes. It does not affect inference.
If `None`, no metadata will be saved.
commit_message (`str`, *optional*, defaults to `"Upload tool"`):
Message to commit while pushing.
use_auth_token (`bool` or `str`, *optional*):
@ -469,7 +489,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
if save_dir is not None:
logger.info(f"Saving model to {save_dir}")
self.save_quantized(save_dir, use_safetensors)
self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
repo_url = create_repo(
repo_id=repo_id, token=token, private=private, exist_ok=True, repo_type="model"
@ -492,7 +512,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
repo_type="model",
)
def save_quantized(self, save_dir: str, use_safetensors: bool = False):
def save_quantized(self, save_dir: str, use_safetensors: bool = False, safetensors_metadata: Optional[Dict[str, str]] = None):
"""save quantized model and configs to local disk"""
os.makedirs(save_dir, exist_ok=True)
@ -506,7 +526,42 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
model_save_name = model_base_name + ".safetensors"
state_dict = self.model.state_dict()
state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
safe_save(state_dict, join(save_dir, model_save_name))
if safetensors_metadata is None:
safetensors_metadata = {}
elif not isinstance(safetensors_metadata, dict):
raise TypeError("safetensors_metadata must be a dictionary.")
else:
logger.debug(f"Received safetensors_metadata: {safetensors_metadata}")
new_safetensors_metadata = {}
converted_keys = False
for key, value in safetensors_metadata.items():
if not isinstance(key, str) or not isinstance(value, str):
converted_keys = True
try:
new_key = str(key)
new_value = str(value)
except Exception as e:
raise TypeError(f"safetensors_metadata: both keys and values must be strings and an error occured when trying to convert them: {e}")
if new_key in new_safetensors_metadata:
logger.warning(f"After converting safetensors_metadata keys to strings, the key '{new_key}' is duplicated. Ensure that all your metadata keys are strings to avoid overwriting.")
new_safetensors_metadata[new_key] = new_value
safetensors_metadata = new_safetensors_metadata
if converted_keys:
logger.debug(f"One or more safetensors_metadata keys or values had to be converted to str(). Final safetensors_metadata: {safetensors_metadata}")
# Format is required to enable Accelerate to load the metadata
# otherwise it raises an OSError
safetensors_metadata['format'] = "pt"
# Store the quantization configuration as safetensors metadata
from auto_gptq import __version__
safetensors_metadata['auto_gptq_version'] = str(__version__)
safetensors_metadata['gptq_bits'] = str(self.quantize_config.bits)
safetensors_metadata['gptq_group_size'] = str(self.quantize_config.group_size)
safetensors_metadata['gptq_desc_act'] = str(self.quantize_config.desc_act)
safetensors_metadata['gptq_damp_percent'] = str(self.quantize_config.damp_percent)
safe_save(state_dict, join(save_dir, model_save_name), safetensors_metadata)
else:
model_save_name = model_base_name + ".bin"
torch.save(self.model.state_dict(), join(save_dir, model_save_name))
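A short sketch of the new `safetensors_metadata` parameter, assuming `model` is an already-quantized model instance. The file name below follows the project's default `gptq_model-{bits}bit-{group_size}g` naming and should be adjusted to whatever `save_quantized` actually wrote.
```python
from safetensors import safe_open

# keys and values must be strings; the format=pt, auto_gptq_version and gptq_* entries are added automatically
model.save_quantized(
    "opt-125m-4bit",
    use_safetensors=True,
    safetensors_metadata={"quantized_by": "example-user", "calibration_dataset": "c4"},
)

# read the metadata back from the safetensors header
with safe_open("opt-125m-4bit/gptq_model-4bit-128g.safetensors", framework="pt") as f:  # assumed default file name
    print(f.metadata())
```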
@ -516,10 +571,10 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
self.quantize_config.model_name_or_path = save_dir
self.quantize_config.model_file_base_name = model_base_name
def save_pretrained(self, save_dir: str, use_safetensors: bool = False, **kwargs):
def save_pretrained(self, save_dir: str, use_safetensors: bool = False, safetensors_metadata: Optional[Dict[str, str]] = None, **kwargs):
"""alias of save_quantized"""
logger.warning("you are using save_pretrained, which will re-direct to save_quantized.")
self.save_quantized(save_dir, use_safetensors)
self.save_quantized(save_dir, use_safetensors, safetensors_metadata)
@classmethod
def from_pretrained(
@ -543,7 +598,29 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
torch.nn.init.uniform_ = skip
torch.nn.init.normal_ = skip
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
# Parameters related to loading from Hugging Face Hub
cache_dir = model_init_kwargs.pop("cache_dir", None)
force_download = model_init_kwargs.pop("force_download", False)
resume_download = model_init_kwargs.pop("resume_download", False)
proxies = model_init_kwargs.pop("proxies", None)
local_files_only = model_init_kwargs.pop("local_files_only", False)
use_auth_token = model_init_kwargs.pop("use_auth_token", None)
revision = model_init_kwargs.pop("revision", None)
subfolder = model_init_kwargs.pop("subfolder", "")
commit_hash = model_init_kwargs.pop("_commit_hash", None)
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
}
config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True, **cached_file_kwargs)
if config.model_type not in SUPPORTED_MODELS:
raise TypeError(f"{config.model_type} isn't supported yet.")
@ -579,7 +656,9 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
torch.cuda.empty_cache()
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_init_kwargs)
merged_kwargs = {**model_init_kwargs, **cached_file_kwargs}
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **merged_kwargs)
model_config = model.config.to_dict()
seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]
if any([k in model_config for k in seq_len_keys]):
@ -597,8 +676,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
@classmethod
def from_quantized(
cls,
model_name_or_path: Optional[str] = None,
save_dir: Optional[str] = None,
model_name_or_path: Optional[str],
device_map: Optional[Union[str, Dict[str, Union[int, str]]]] = None,
max_memory: Optional[dict] = None,
device: Optional[Union[str, int]] = None,
@ -613,6 +691,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
use_safetensors: bool = False,
trust_remote_code: bool = False,
warmup_triton: bool = False,
trainable: bool = False,
**kwargs
):
"""load quantized model from local disk"""
@ -628,20 +707,25 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
subfolder = kwargs.pop("subfolder", "")
commit_hash = kwargs.pop("_commit_hash", None)
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
"_raise_exceptions_for_missing_entries": False,
"_commit_hash": commit_hash,
}
if use_triton and not TRITON_AVAILABLE:
logger.warning("triton is not installed, reset use_triton to False")
use_triton = False
# == step1: prepare configs and file names == #
if model_name_or_path and save_dir:
logger.warning("save_dir will be ignored because model_name_or_path is explicit specified.")
if not model_name_or_path and save_dir:
model_name_or_path = save_dir
warnings.warn("save_dir is deprecated and will be removed in version 0.3.0", PendingDeprecationWarning, stacklevel=2)
if not model_name_or_path and not save_dir:
raise ValueError("at least one of model_name_or_path or save_dir should be specified.")
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code, **cached_file_kwargs)
if config.model_type not in SUPPORTED_MODELS:
raise TypeError(f"{config.model_type} isn't supported yet.")
@ -670,25 +754,11 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
resolved_archive_file = None
if is_local:
model_save_name = join(model_name_or_path, model_basename)
for ext in extensions:
if isfile(model_save_name + ext):
resolved_archive_file = model_save_name + ext
break
else: # remote
cached_file_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"resume_download": resume_download,
"local_files_only": local_files_only,
"use_auth_token": use_auth_token,
"revision": revision,
"subfolder": subfolder,
"_raise_exceptions_for_missing_entries": False,
"_commit_hash": commit_hash,
}
for ext in extensions:
resolved_archive_file = cached_file(model_name_or_path, model_basename + ext, **cached_file_kwargs)
if resolved_archive_file is not None:
@ -699,6 +769,9 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
model_save_name = resolved_archive_file
if not use_triton and trainable:
logger.warning("QuantLinear with cuda backend not support trainable mode yet, Switch to the pytorch backend.")
# == step2: convert model to gptq-model (replace Linear with QuantLinear) == #
def skip(*args, **kwargs):
pass
@ -734,7 +807,8 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
quantize_config.group_size,
use_triton=use_triton,
use_cuda_fp16=use_cuda_fp16,
desc_act=quantize_config.desc_act
desc_act=quantize_config.desc_act,
trainable=trainable
)
model.tie_weights()
@ -794,6 +868,7 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
# == step5: (optional) inject optimized module == #
if inject_fused_attention:
if cls.fused_attn_module_type is None:
inject_fused_attention = False
logger.warning(f"{cls.__name__} hasn't fused attention module yet, will skip inject fused attention.")
else:
cls.fused_attn_module_type.inject_to_model(
@ -801,10 +876,12 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
use_triton=use_triton,
group_size=quantize_config.group_size,
use_cuda_fp16=use_cuda_fp16,
desc_act=quantize_config.desc_act
desc_act=quantize_config.desc_act,
trainable=trainable
)
if inject_fused_mlp:
if cls.fused_mlp_module_type is None:
inject_fused_mlp = False
logger.warning(f"{cls.__name__} hasn't fused mlp module yet, will skip inject fused mlp.")
else:
cls.fused_mlp_module_type.inject_to_model(
@ -815,13 +892,26 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
model.eval()
# == step6: (optional) warmup triton == #
if use_triton and warmup_triton:
from ..nn_modules.qlinear_triton import QuantLinear
from ..nn_modules.qlinear.qlinear_triton import QuantLinear
QuantLinear.warmup(model, seqlen=model.seqlen)
if inject_fused_mlp and cls.fused_mlp_module_type is not None:
cls.fused_mlp_module_type.warmup(model, seqlen=model.seqlen)
return cls(model, True, quantize_config)
# == step7: make model compatible with peft
cls.make_sure_compatible_with_peft(
model, use_triton, quantize_config.desc_act, quantize_config.group_size
)
return cls(
model,
True,
quantize_config,
is_triton_backend=use_triton,
injected_fused_attention=inject_fused_attention,
injected_fused_mlp=inject_fused_mlp and use_triton,
trainable=trainable
)
def warmup_triton(self, enabled: bool = True):
if not enabled:
@ -830,11 +920,34 @@ class BaseGPTQForCausalLM(nn.Module, PushToHubMixin):
logger.warning(f"triton is not available, skip warmup stage directly.")
return
from ..nn_modules.qlinear_triton import QuantLinear
from ..nn_modules.qlinear.qlinear_triton import QuantLinear
QuantLinear.warmup(self.model, seqlen=self.model.seqlen)
if self.fused_mlp_module_type is not None:
self.fused_mlp_module_type.warmup(self.model, seqlen=self.model.seqlen)
def enable_trainable_mode(self, enabled: bool = True):
if not self.is_triton_backend and enabled:
raise NotImplementedError("For now, trainable mode only supports triton backend.")
for n, m in self.model.named_modules():
if hasattr(m, "trainable"):
setattr(m, "trainable", enabled)
def disable_trainable_mode(self):
self.enable_trainable_mode(enabled=False)
@staticmethod
def make_sure_compatible_with_peft(model: PreTrainedModel, use_triton: bool, desc_act: bool, group_size: int):
GeneralQuantLinear.inject_to_model(
model,
dynamically_import_QuantLinear(use_triton, desc_act, group_size)
)
def __getattr__(self, item):
try:
return super().__getattr__(item)
except:
return getattr(self.model, item)
__all__ = ["BaseGPTQForCausalLM", "BaseQuantizeConfig"]

View file

@ -1,12 +1,27 @@
from packaging.version import parse as parse_version
from torch import device
from transformers import __version__ as transformers_version
from ..utils.import_utils import compare_transformers_version
CPU = device("cpu")
CUDA_0 = device("cuda:0")
SUPPORTED_MODELS = ["bloom", "gptj", "gpt2", "gpt_neox", "opt", "moss", "gpt_bigcode", "codegen", "RefinedWebModel", "RefinedWeb", "mpt"]
SUPPORTED_MODELS = [
"bloom",
"gptj",
"gpt2",
"gpt_neox",
"opt",
"moss",
"gpt_bigcode",
"codegen",
"RefinedWebModel",
"RefinedWeb",
"baichuan",
"internlm",
"mpt",
]
if compare_transformers_version("v4.28.0", op="ge"):
SUPPORTED_MODELS.append("llama")
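`SUPPORTED_MODELS` now also lists `baichuan`, `internlm` and `mpt`. The sketch below mirrors the gate that `from_pretrained`/`from_quantized` apply against a model's config (see `_base.py` above); the repo id is only an example.
```python
from transformers import AutoConfig
from auto_gptq.modeling._const import SUPPORTED_MODELS

config = AutoConfig.from_pretrained("mosaicml/mpt-7b", trust_remote_code=True)
if config.model_type not in SUPPORTED_MODELS:
    raise TypeError(f"{config.model_type} isn't supported yet.")
print(config.model_type)  # "mpt"
```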

View file

@ -50,7 +50,17 @@ def get_module_by_name_suffix(model, module_name: str):
return module
def make_quant(module, names, bits, group_size, name='', use_triton=False, use_cuda_fp16=True, desc_act=False):
def make_quant(
module,
names,
bits,
group_size,
name='',
use_triton=False,
use_cuda_fp16=True,
desc_act=False,
trainable=False
):
QuantLinear = dynamically_import_QuantLinear(use_triton=use_triton, desc_act=desc_act, group_size=group_size)
if isinstance(module, QuantLinear):
@ -71,13 +81,25 @@ def make_quant(module, names, bits, group_size, name='', use_triton=False, use_c
in_features = tmp.weight.shape[0]
out_features = tmp.weight.shape[1]
if (not(desc_act) or group_size == -1) and not use_triton:
new_layer = QuantLinear(bits, group_size, in_features, out_features, True, use_cuda_fp16=use_cuda_fp16)
new_layer = QuantLinear(
bits, group_size, in_features, out_features, True, use_cuda_fp16=use_cuda_fp16, trainable=trainable
)
else:
new_layer = QuantLinear(bits, group_size, in_features, out_features, True)
new_layer = QuantLinear(bits, group_size, in_features, out_features, True, trainable=trainable)
new_layer.device = ori_layer_device
setattr(module, attr, new_layer.to(ori_layer_device))
for name1, child in module.named_children():
make_quant(child, names, bits, group_size, name + '.' + name1 if name != '' else name1, use_triton=use_triton, use_cuda_fp16=use_cuda_fp16,desc_act=desc_act)
make_quant(
child,
names,
bits,
group_size,
name + '.' + name1 if name != '' else name1,
use_triton=use_triton,
use_cuda_fp16=use_cuda_fp16,
desc_act=desc_act,
trainable=trainable
)
def pack_model(

View file

@ -1,4 +1,5 @@
from typing import Optional
from inspect import signature
from typing import Dict, Optional, Union
from ._base import BaseQuantizeConfig, BaseGPTQForCausalLM
from ._utils import check_and_get_model_type
@ -12,6 +13,8 @@ from .moss import MOSSGPTQForCausalLM
from .opt import OPTGPTQForCausalLM
from .rw import RWGPTQForCausalLM
from .gpt_bigcode import GPTBigCodeGPTQForCausalLM
from .baichuan import BaiChuanGPTQForCausalLM
from .internlm import InternLMGPTQForCausalLM
from .mpt import MPTGPTQForCausalLM
@ -26,8 +29,10 @@ GPTQ_CAUSAL_LM_MODEL_MAP = {
"gpt_bigcode": GPTBigCodeGPTQForCausalLM,
"codegen": CodeGenGPTQForCausalLM,
"RefinedWebModel": RWGPTQForCausalLM,
"RefinedWeb":RWGPTQForCausalLM,
"mpt": MPTGPTQForCausalLM
"RefinedWeb": RWGPTQForCausalLM,
"baichuan": BaiChuanGPTQForCausalLM,
"internlm": InternLMGPTQForCausalLM,
"mpt": MPTGPTQForCausalLM,
}
@ -48,7 +53,9 @@ class AutoGPTQForCausalLM:
trust_remote_code: bool = False,
**model_init_kwargs
) -> BaseGPTQForCausalLM:
model_type = check_and_get_model_type(pretrained_model_name_or_path, trust_remote_code)
model_type = check_and_get_model_type(
pretrained_model_name_or_path, trust_remote_code
)
return GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_pretrained(
pretrained_model_name_or_path=pretrained_model_name_or_path,
quantize_config=quantize_config,
@ -60,8 +67,7 @@ class AutoGPTQForCausalLM:
@classmethod
def from_quantized(
cls,
model_name_or_path: Optional[str] = None,
save_dir: Optional[str] = None,
model_name_or_path: Optional[str],
device_map: Optional[Union[str, Dict[str, Union[str, int]]]] = None,
max_memory: Optional[dict] = None,
device: Optional[Union[str, int]] = None,
@ -75,14 +81,32 @@ class AutoGPTQForCausalLM:
use_safetensors: bool = False,
trust_remote_code: bool = False,
warmup_triton: bool = False,
trainable: bool = False,
**kwargs
) -> BaseGPTQForCausalLM:
model_type = check_and_get_model_type(save_dir or model_name_or_path, trust_remote_code)
model_type = check_and_get_model_type(model_name_or_path, trust_remote_code)
quant_func = GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized
keywords = {key: kwargs[key] for key in signature(quant_func).parameters if key in kwargs}
# A static list of kwargs needed for huggingface_hub
huggingface_kwargs = [
"cache_dir",
"force_download",
"proxies",
"resume_download",
"local_files_only",
"use_auth_token",
"revision",
"subfolder",
"_raise_exceptions_for_missing_entries",
"_commit_hash"
]
# TODO: do we need this filtering of kwargs? @PanQiWei is there a reason we can't just pass all kwargs?
keywords = {
key: kwargs[key]
for key in list(signature(quant_func).parameters.keys()) + huggingface_kwargs
if key in kwargs
}
return quant_func(
model_name_or_path=model_name_or_path,
save_dir=save_dir,
device_map=device_map,
max_memory=max_memory,
device=device,
@ -96,6 +120,7 @@ class AutoGPTQForCausalLM:
use_safetensors=use_safetensors,
trust_remote_code=trust_remote_code,
warmup_triton=warmup_triton,
trainable=trainable,
**keywords
)
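With the `huggingface_kwargs` pass-through, Hub-related options such as `cache_dir`, `revision` and `use_auth_token` now reach the remote file resolution step. A hedged sketch with a placeholder repo id:
```python
from auto_gptq import AutoGPTQForCausalLM

model = AutoGPTQForCausalLM.from_quantized(
    "someuser/some-gptq-model",  # placeholder repo id
    revision="main",             # forwarded to huggingface_hub when resolving files
    use_safetensors=True,
    device="cuda:0",
    use_triton=False,
    trainable=False,
)
```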

View file

@ -0,0 +1,16 @@
from ._base import *
class BaiChuanGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "DecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.W_pack"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"]
]
__all__ = ["BaiChuanGPTQForCausalLM"]

View file

@ -0,0 +1,16 @@
from ._base import *
class InternLMGPTQForCausalLM(BaseGPTQForCausalLM):
layer_type = "InternLMDecoderLayer"
layers_block_name = "model.layers"
outside_layer_modules = ["model.embed_tokens", "model.norm"]
inside_layer_modules = [
["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
["self_attn.o_proj"],
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]
__all__ = ["InternLMGPTQForCausalLM"]

View file

@ -18,7 +18,16 @@ class FusedBaseModule(nn.Module, TritonModuleMixin):
class FusedBaseAttentionModule(FusedBaseModule):
@classmethod
@abstractmethod
def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
def inject_to_model(
cls,
model,
use_triton=False,
group_size=-1,
use_cuda_fp16=True,
desc_act=False,
trainable=False,
**kwargs
):
raise NotImplementedError()
@classmethod

View file

@ -226,7 +226,16 @@ class FusedGPTJAttentionForQuantizedModel(FusedBaseAttentionModule):
return outputs # a, present, (attentions)
@classmethod
def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
def inject_to_model(
cls,
model,
use_triton=False,
group_size=-1,
use_cuda_fp16=True,
desc_act=False,
trainable=False,
**kwargs
):
config = model.config
QuantLinear = dynamically_import_QuantLinear(use_triton=use_triton, desc_act=desc_act, group_size=group_size)
@ -253,7 +262,7 @@ class FusedGPTJAttentionForQuantizedModel(FusedBaseAttentionModule):
q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
True if q_proj.bias is not None else False,
)
qlinear_kwargs = dict()
qlinear_kwargs = {"trainable": trainable}
if (not desc_act or group_size == -1) and not use_triton:
qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
qkv_proj = QuantLinear(*qlinear_args, **qlinear_kwargs)

View file

@ -126,7 +126,16 @@ class FusedLlamaAttentionForQuantizedModel(FusedBaseAttentionModule):
return attn_output, attn_weights, past_key_value
@classmethod
def inject_to_model(cls, model, use_triton=False, group_size=-1, use_cuda_fp16=True, desc_act=False, **kwargs):
def inject_to_model(
cls,
model,
use_triton=False,
group_size=-1,
use_cuda_fp16=True,
desc_act=False,
trainable=False,
**kwargs
):
"""
Replace all LlamaAttention modules with QuantLlamaAttention modules, fusing the q, k, v projections.
"""
@ -153,7 +162,7 @@ class FusedLlamaAttentionForQuantizedModel(FusedBaseAttentionModule):
q_proj.outfeatures + k_proj.outfeatures + v_proj.outfeatures,
True if q_proj.bias is not None else False,
)
qlinear_kwargs = dict()
qlinear_kwargs = {"trainable": trainable}
if (not desc_act or group_size == -1) and not use_triton:
qlinear_kwargs["use_cuda_fp16"] = use_cuda_fp16
qkv_layer = QuantLinear(*qlinear_args, **qlinear_kwargs)

View file

@ -237,14 +237,6 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
up_proj,
):
super().__init__()
self.register_buffer('gate_proj_qweight', gate_proj.qweight)
self.register_buffer('gate_proj_scales', gate_proj.scales)
self.register_buffer('gate_proj_qzeros', gate_proj.qzeros)
self.register_buffer('gate_proj_g_idx', gate_proj.g_idx)
self.register_buffer('up_proj_qweight', up_proj.qweight)
self.register_buffer('up_proj_scales', up_proj.scales)
self.register_buffer('up_proj_qzeros', up_proj.qzeros)
self.register_buffer('up_proj_g_idx', up_proj.g_idx)
self.infeatures = gate_proj.infeatures
self.intermediate_size = gate_proj.outfeatures
@ -252,6 +244,8 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
self.bits = gate_proj.bits
self.maxq = gate_proj.maxq
self.gate_proj = gate_proj
self.up_proj = up_proj
self.down_proj = down_proj
def forward(self, x):
@ -266,40 +260,20 @@ class FusedLlamaMLPForQuantizedModel(FusedBaseMLPModule):
c = torch.empty((M, N), device=x.device, dtype=torch.float16)
grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), )
quant_fused_matmul_248_kernel[grid](
x, c, self.gate_proj_qweight,
self.gate_proj_scales, self.gate_proj_qzeros, self.gate_proj_g_idx,
self.up_proj_qweight,
self.up_proj_scales, self.up_proj_qzeros, self.up_proj_g_idx,
x, c, self.gate_proj.qweight,
self.gate_proj.scales, self.gate_proj.qzeros, self.gate_proj.g_idx,
self.up_proj.qweight,
self.up_proj.scales, self.up_proj.qzeros, self.up_proj.g_idx,
M, N, K,
self.bits, self.maxq,
x.stride(0), x.stride(1),
self.gate_proj_qweight.stride(0), self.gate_proj_qweight.stride(1),
self.gate_proj.qweight.stride(0), self.gate_proj.qweight.stride(1),
c.stride(0), c.stride(1),
self.gate_proj_scales.stride(0), self.gate_proj_qzeros.stride(0)
self.gate_proj.scales.stride(0), self.gate_proj.qzeros.stride(0)
)
c = c.reshape(out_shape)
return c
def fused2cuda(self):
self.gate_proj_qweight = self.gate_proj_qweight.cuda()
self.gate_proj_scales = self.gate_proj_scales.cuda()
self.gate_proj_qzeros = self.gate_proj_qzeros.cuda()
self.gate_proj_g_idx = self.gate_proj_g_idx.cuda()
self.up_proj_qweight = self.up_proj_qweight.cuda()
self.up_proj_scales = self.up_proj_scales.cuda()
self.up_proj_qzeros = self.up_proj_qzeros.cuda()
self.up_proj_g_idx = self.up_proj_g_idx.cuda()
def fused2cpu(self):
self.gate_proj_qweight = self.gate_proj_qweight.cpu()
self.gate_proj_scales = self.gate_proj_scales.cpu()
self.gate_proj_qzeros = self.gate_proj_qzeros.cpu()
self.gate_proj_g_idx = self.gate_proj_g_idx.cpu()
self.up_proj_qweight = self.up_proj_qweight.cpu()
self.up_proj_scales = self.up_proj_scales.cpu()
self.up_proj_qzeros = self.up_proj_qzeros.cpu()
self.up_proj_g_idx = self.up_proj_g_idx.cpu()
@classmethod
def inject_to_model(cls, model, use_triton=False, **kwargs):
if not use_triton:

View file

@ -0,0 +1,57 @@
import torch.nn as nn
class GeneralQuantLinear(nn.Linear):
def __init__(self, quant_linear_module):
super().__init__(
in_features=quant_linear_module.infeatures,
out_features=quant_linear_module.outfeatures,
bias=True
)
self.infeatures = quant_linear_module.infeatures
self.outfeatures = quant_linear_module.outfeatures
self.bits = quant_linear_module.bits
self.group_size = quant_linear_module.group_size
self.maxq = quant_linear_module.maxq
self.weight.requires_grad = False
self.weight.data = quant_linear_module.qweight
self.qweight = self.weight
self.bias.data = quant_linear_module.bias
self.qweight.requires_grad = False
self.bias.requires_grad = False
self.qzeros = quant_linear_module.qzeros
self.scales = quant_linear_module.scales
self.g_idx = quant_linear_module.g_idx
if hasattr(quant_linear_module, "wf"):
self.wf = quant_linear_module.wf
if hasattr(quant_linear_module, "kernel_switch_threshold"):
self.kernel_switch_threshold = quant_linear_module.kernel_switch_threshold
if hasattr(quant_linear_module, "autogptq_cuda_available"):
self.autogptq_cuda_available = quant_linear_module.autogptq_cuda_available
self.trainable = quant_linear_module.trainable
self.forward = quant_linear_module.forward
@classmethod
def inject_to_model(cls, model, target_module_type):
for name, m in model.named_modules():
if not isinstance(m, target_module_type):
continue
new_m = cls(m)
if '.' in name:
parent_name = name.rsplit('.', 1)[0]
child_name = name[len(parent_name) + 1:]
parent = model.get_submodule(parent_name)
else:
parent_name = ''
parent = model
child_name = name
setattr(parent, child_name, new_m)
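`GeneralQuantLinear` re-exposes a quantized linear layer as a plain `nn.Linear` subclass so that libraries such as peft can target it; `from_quantized` already applies it via `make_sure_compatible_with_peft`. The sketch below shows roughly what that helper does, assuming `model` is a quantized model that has already been loaded and that the settings match how it was loaded.
```python
from auto_gptq.nn_modules.qlinear import GeneralQuantLinear
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear

# the settings must match how `model` was loaded (backend, act-order, group size)
QuantLinear = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=128)
GeneralQuantLinear.inject_to_model(model, QuantLinear)
```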

View file

@ -9,11 +9,13 @@ import transformers
logger = getLogger(__name__)
try:
import autogptq_cuda
import autogptq_cuda_256
import autogptq_cuda_64
_autogptq_cuda_available = True
except ImportError:
logger.warning('CUDA extension not installed.')
autogptq_cuda_256 = None
autogptq_cuda_64 = None
_autogptq_cuda_available = False
@ -26,10 +28,14 @@ class QuantLinear(nn.Module):
outfeatures,
bias,
kernel_switch_threshold=128,
trainable=False
):
super().__init__()
global _autogptq_cuda_available
if bits not in [2, 3, 4, 8]:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
if trainable:
_autogptq_cuda_available = False
self.infeatures = infeatures
self.outfeatures = outfeatures
@ -73,9 +79,15 @@ class QuantLinear(nn.Module):
self.kernel_switch_threshold = kernel_switch_threshold
self.autogptq_cuda_available = _autogptq_cuda_available
self.autogptq_cuda = autogptq_cuda_256
if infeatures % 256 != 0 or outfeatures % 256 != 0:
self.autogptq_cuda = autogptq_cuda_64
if infeatures % 64 != 0 or outfeatures % 64 != 0:
self.autogptq_cuda_available = False
self.trainable = trainable
def pack(self, linear, scales, zeros, g_idx=None):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
@ -184,13 +196,13 @@ class QuantLinear(nn.Module):
):
out = torch.zeros((x.shape[0], self.outfeatures), device=x.device, dtype=torch.float32)
if self.bits == 2:
autogptq_cuda.vecquant2matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
self.autogptq_cuda.vecquant2matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
elif self.bits == 3:
autogptq_cuda.vecquant3matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
self.autogptq_cuda.vecquant3matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
elif self.bits == 4:
autogptq_cuda.vecquant4matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
self.autogptq_cuda.vecquant4matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
elif self.bits == 8:
autogptq_cuda.vecquant8matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
self.autogptq_cuda.vecquant8matmul(x.float(), self.qweight, out, self.scales.float(), self.qzeros, self.g_idx)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
else:
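In this cuda wrapper, passing `trainable=True` bypasses the compiled CUDA kernels so the layer falls back to the pure-pytorch dequantization path. A quick hedged check with assumed layer sizes (`autogptq_cuda_available` is of course also `False` whenever the extension is not installed):
```python
from auto_gptq.nn_modules.qlinear.qlinear_cuda import QuantLinear

layer = QuantLinear(bits=4, group_size=128, infeatures=4096, outfeatures=4096, bias=True, trainable=True)
print(layer.trainable)                # True
print(layer.autogptq_cuda_available)  # False: trainable mode skips the CUDA extension
```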

View file

@ -7,15 +7,17 @@ import torch.nn as nn
import transformers
logger = getLogger(__name__)
try:
import autogptq_cuda
import autogptq_cuda_256
import autogptq_cuda_64
_autogptq_cuda_available = True
except ImportError:
logger.warning('CUDA extension not installed.')
autogptq_cuda_256 = None
autogptq_cuda_64 = None
_autogptq_cuda_available = False
class QuantLinear(nn.Module):
def __init__(
self,
@ -25,12 +27,15 @@ class QuantLinear(nn.Module):
outfeatures,
bias,
use_cuda_fp16=True,
kernel_switch_threshold=128
kernel_switch_threshold=128,
trainable=False
):
super().__init__()
global _autogptq_cuda_available
if bits not in [2, 3, 4, 8]:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
if trainable:
_autogptq_cuda_available = False
self.infeatures = infeatures
self.outfeatures = outfeatures
self.bits = bits
@ -77,10 +82,21 @@ class QuantLinear(nn.Module):
self.kernel_switch_threshold = kernel_switch_threshold
self.autogptq_cuda_available = _autogptq_cuda_available
self.autogptq_cuda = autogptq_cuda_256
if infeatures % 256 != 0 or outfeatures % 256 != 0:
self.autogptq_cuda = autogptq_cuda_64
if infeatures % 64 != 0 or outfeatures % 64 != 0:
self.autogptq_cuda_available = False
self.trainable = trainable
def pack(self, linear, scales, zeros, g_idx):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
W = W.flatten(1)
if isinstance(linear, transformers.pytorch_utils.Conv1D):
W = W.t()
scales = scales.t().contiguous()
zeros = zeros.t().contiguous()
scale_zeros = zeros * scales
@ -93,7 +109,7 @@ class QuantLinear(nn.Module):
g_idx = idx // self.group_size
intweight.append(
torch.round(
(linear.weight.data[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]
(W[:, idx] + scale_zeros[g_idx]) / self.scales[g_idx]
).to(torch.int)[:, None]
)
intweight = torch.cat(intweight, dim=1)
@ -182,24 +198,24 @@ class QuantLinear(nn.Module):
if self.use_cuda_fp16:
x = x.half()
if self.bits == 2:
autogptq_cuda.vecquant2matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
self.autogptq_cuda.vecquant2matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
elif self.bits == 3:
autogptq_cuda.vecquant3matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
self.autogptq_cuda.vecquant3matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
elif self.bits == 4:
autogptq_cuda.vecquant4matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
self.autogptq_cuda.vecquant4matmul_faster_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size, self.half_indim)
else:
raise NotImplementedError("Only 2,3,4 bits are supported.")
else:
x = x.float()
if self.bits == 2:
autogptq_cuda.vecquant2matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
self.autogptq_cuda.vecquant2matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
elif self.bits == 3:
autogptq_cuda.vecquant3matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
self.autogptq_cuda.vecquant3matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
elif self.bits == 4:
autogptq_cuda.vecquant4matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
self.autogptq_cuda.vecquant4matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
elif self.bits == 8:
autogptq_cuda.vecquant8matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
self.autogptq_cuda.vecquant8matmul_old(x, self.qweight, out, self.scales.float(), self.qzeros, self.group_size)
else:
raise NotImplementedError("Only 2,3,4,8 bits are supported.")
else:

View file

@ -1,17 +1,20 @@
import math
from logging import getLogger
import numpy as np
import torch
import torch.nn as nn
import transformers
from torch.cuda.amp import custom_bwd, custom_fwd
from logging import getLogger
from .triton_utils.mixin import TritonModuleMixin
from ..triton_utils.mixin import TritonModuleMixin
logger = getLogger(__name__)
try:
from .triton_utils.kernels import quant_matmul_248, transpose_quant_matmul_248, QuantLinearFunction
from ..triton_utils.kernels import (
quant_matmul_248, transpose_quant_matmul_248, quant_matmul_inference_only_248,
QuantLinearFunction, QuantLinearInferenceOnlyFunction
)
except ImportError:
logger.error('triton not installed.')
raise
@ -24,13 +27,14 @@ class QuantLinear(nn.Module, TritonModuleMixin):
group_size,
infeatures,
outfeatures,
bias
bias,
trainable=False
):
super().__init__()
if bits not in [2, 4, 8]:
raise NotImplementedError("Only 2,4,8 bits are supported.")
if infeatures % 256 != 0 or outfeatures % 256 != 0:
raise NotImplementedError("in_feature or out_feature must be divisible by 256.")
if infeatures % 32 != 0 or outfeatures % 32 != 0:
raise NotImplementedError("in_feature and out_feature must be divisible by 32.")
self.infeatures = infeatures
self.outfeatures = outfeatures
self.bits = bits
@ -58,6 +62,8 @@ class QuantLinear(nn.Module, TritonModuleMixin):
else:
self.bias = None
self.trainable = trainable
def pack(self, linear, scales, zeros, g_idx=None):
W = linear.weight.data.clone()
if isinstance(linear, nn.Conv2d):
@ -122,7 +128,8 @@ class QuantLinear(nn.Module, TritonModuleMixin):
def forward(self, x):
out_shape = x.shape[:-1] + (self.outfeatures,)
out = QuantLinearFunction.apply(
quant_linear_fn = QuantLinearFunction if self.trainable else QuantLinearInferenceOnlyFunction
out = quant_linear_fn.apply(
x.reshape(-1, x.shape[-1]),
self.qweight,
self.scales,
@ -160,11 +167,14 @@ class QuantLinear(nn.Module, TritonModuleMixin):
for m in tqdm(range(0, math.ceil(math.log2(seqlen)) + 1)):
m = 2 ** m
for (k, n), (qweight, scales, qzeros, g_idx, bits, maxq) in kn_values.items():
if transpose:
a = torch.randn(m, k, dtype=torch.float16, device=model.device)
quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
if transpose:
a = torch.randn(m, n, dtype=torch.float16, device=model.device)
transpose_quant_matmul_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
else:
a = torch.randn(m, k, dtype=torch.float16, device=model.device)
quant_matmul_inference_only_248(a, qweight, scales, qzeros, g_idx, bits, maxq)
del kn_values
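The triton `QuantLinear` now accepts shapes divisible by 32 (previously 256) and a `trainable` flag that routes the forward pass through `QuantLinearFunction`, which defines a backward. A small sketch with arbitrary sizes (requires triton and a CUDA device; a freshly constructed layer holds zero-initialized buffers, so the output is only meaningful after `pack`):
```python
import torch
from auto_gptq.nn_modules.qlinear.qlinear_triton import QuantLinear

layer = QuantLinear(bits=4, group_size=32, infeatures=96, outfeatures=64, bias=True, trainable=True).cuda()
x = torch.zeros(2, 96, dtype=torch.float16, device="cuda")
y = layer(x)    # trainable=True -> QuantLinearFunction (with backward) instead of the inference-only path
print(y.shape)  # torch.Size([2, 64])
```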

View file

@ -73,27 +73,7 @@ logger = getLogger(__name__)
},
num_stages=2,
num_warps=8
),
triton.Config(
{
'BLOCK_SIZE_M': 64,
'BLOCK_SIZE_N': 64,
'BLOCK_SIZE_K': 64,
'GROUP_SIZE_M': 8
},
num_stages=3,
num_warps=8
),
triton.Config(
{
'BLOCK_SIZE_M': 32,
'BLOCK_SIZE_N': 32,
'BLOCK_SIZE_K': 128,
'GROUP_SIZE_M': 8
},
num_stages=2,
num_warps=4
),
)
],
key=['M', 'N', 'K'],
nearest_power_of_two=True,
@ -244,27 +224,7 @@ def quant_matmul_248_kernel(
},
num_stages=2,
num_warps=8
),
triton.Config(
{
'BLOCK_SIZE_M': 64,
'BLOCK_SIZE_N': 64,
'BLOCK_SIZE_K': 64,
'GROUP_SIZE_M': 8
},
num_stages=3,
num_warps=8
),
triton.Config(
{
'BLOCK_SIZE_M': 32,
'BLOCK_SIZE_N': 128,
'BLOCK_SIZE_K': 32,
'GROUP_SIZE_M': 8
},
num_stages=2,
num_warps=4
),
)
],
key=['M', 'N', 'K'],
nearest_power_of_two=True
@ -356,7 +316,6 @@ def silu(x):
return x * tl.sigmoid(x)
def quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq):
with torch.cuda.device(input.device):
output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=input.dtype)
@ -414,3 +373,30 @@ class QuantLinearFunction(torch.autograd.Function):
if ctx.needs_input_grad[0]:
grad_input = transpose_quant_matmul_248(grad_output, qweight, scales, qzeros, g_idx, bits, maxq)
return grad_input, None, None, None, None, None, None
def quant_matmul_inference_only_248(input, qweight, scales, qzeros, g_idx, bits, maxq):
with torch.cuda.device(input.device):
output = torch.empty((input.shape[0], qweight.shape[1]), device=input.device, dtype=torch.float16)
grid = lambda META: (
triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(qweight.shape[1], META['BLOCK_SIZE_N']),
)
quant_matmul_248_kernel[grid](
input, qweight, output,
scales, qzeros, g_idx,
input.shape[0], qweight.shape[1], input.shape[1],
bits, maxq,
input.stride(0), input.stride(1),
qweight.stride(0), qweight.stride(1),
output.stride(0), output.stride(1),
scales.stride(0), qzeros.stride(0)
)
return output
class QuantLinearInferenceOnlyFunction(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
output = quant_matmul_248(input, qweight, scales, qzeros, g_idx, bits, maxq)
return output

View file

@ -0,0 +1 @@
from .perplexity_utils import Perplexity

View file

@ -7,15 +7,22 @@ try:
except ImportError:
TRITON_AVAILABLE = False
try:
import autogptq_cuda
AUTOGPTQ_CUDA_AVAILABLE = True
except:
AUTOGPTQ_CUDA_AVAILABLE = False
def dynamically_import_QuantLinear(use_triton: bool, desc_act: bool, group_size: int):
if use_triton:
from ..nn_modules.qlinear_triton import QuantLinear
from ..nn_modules.qlinear.qlinear_triton import QuantLinear
else:
if not desc_act or group_size == -1:
from ..nn_modules.qlinear_old import QuantLinear
from ..nn_modules.qlinear.qlinear_cuda_old import QuantLinear
else:
from ..nn_modules.qlinear import QuantLinear
from ..nn_modules.qlinear.qlinear_cuda import QuantLinear
return QuantLinear
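The dispatch above decides which kernel implementation a quantized layer gets. A tiny sketch of two of the outcomes (the triton branch additionally requires triton to be installed):
```python
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear

ql = dynamically_import_QuantLinear(use_triton=False, desc_act=False, group_size=128)
print(ql.__module__)  # auto_gptq.nn_modules.qlinear.qlinear_cuda_old

ql = dynamically_import_QuantLinear(use_triton=False, desc_act=True, group_size=128)
print(ql.__module__)  # auto_gptq.nn_modules.qlinear.qlinear_cuda
```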

View file

@ -0,0 +1,423 @@
import warnings
import re
from contextlib import contextmanager
from dataclasses import asdict
from enum import Enum
from typing import List, Optional
import torch
from peft import get_peft_model, PeftConfig, PeftModel, PeftType
from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING
from peft.tuners.lora import LoraConfig, LoraLayer, LoraModel, Embedding
from peft.tuners.adalora import AdaLoraConfig, AdaLoraLayer, AdaLoraModel
from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING
from peft.utils.other import _get_submodules
from ..modeling._base import BaseGPTQForCausalLM
class GPTQLoraConfig(LoraConfig):
injected_fused_attention: bool = False
injected_fused_mlp: bool = False
class GPTQLoraLinear(torch.nn.Linear, LoraLayer):
def __init__(
self,
adapter_name: str,
linear_module: torch.nn.Linear,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
**kwargs,
):
init_lora_weights = kwargs.pop("init_lora_weights", True)
torch.nn.Linear.__init__(self, linear_module.in_features, linear_module.out_features)
LoraLayer.__init__(self, linear_module.in_features, linear_module.out_features)
self.linear_module = linear_module
self.weight.requires_grad = False
self.weight = self.linear_module.weight
self.bias = self.linear_module.bias
self.fan_in_fan_out = fan_in_fan_out
if fan_in_fan_out:
self.weight.data = self.weight.data.T
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
self.active_adapter = adapter_name
def reset_lora_parameters(self, adapter_name):
if adapter_name in self.lora_A.keys():
torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight)
torch.nn.init.zeros_(self.lora_B[adapter_name].weight)
def merge(self):
raise NotImplementedError("gptq model not support merge lora adapter")
def unmerge(self):
raise NotImplementedError("gptq model not support unmerge lora adapter")
def forward(self, x: torch.Tensor):
previous_dtype = x.dtype
if self.active_adapter not in self.lora_A.keys():
return self.linear_module(x)
if self.disable_adapters:
if self.r[self.active_adapter] > 0 and self.merged:
self.unmerge()
result = self.linear_module(x)
elif self.r[self.active_adapter] > 0 and not self.merged:
result = self.linear_module(x)
lora_B = self.lora_B[self.active_adapter]
lora_A = self.lora_A[self.active_adapter]
lora_dropout = self.lora_dropout[self.active_adapter]
scale = self.scaling[self.active_adapter]
x = x.type_as(lora_A.weight.data)
adapter_result = (lora_B(lora_A(lora_dropout(x))) * scale).type_as(result)
result += adapter_result
else:
result = self.linear_module(x)
result = result.to(previous_dtype)
return result
class GPTQLoraModel(LoraModel):
def _find_and_replace(self, adapter_name):
lora_config = self.peft_config[adapter_name]
is_target_modules_in_base_model = False
kwargs = {
"r": lora_config.r,
"lora_alpha": lora_config.lora_alpha,
"lora_dropout": lora_config.lora_dropout,
"fan_in_fan_out": lora_config.fan_in_fan_out,
"init_lora_weights": lora_config.init_lora_weights,
}
key_list = [key for key, _ in self.model.named_modules()]
for key in key_list:
if isinstance(lora_config.target_modules, str):
target_module_found = re.fullmatch(lora_config.target_modules, key)
else:
target_module_found = any(key.endswith(target_key) for target_key in lora_config.target_modules)
if target_module_found:
if not is_target_modules_in_base_model:
is_target_modules_in_base_model = True
parent, target, target_name = _get_submodules(self.model, key)
bias = False
if hasattr(target, "bias"):
bias = target.bias is not None
if isinstance(target, LoraLayer):
target.update_layer(
adapter_name,
lora_config.r,
lora_config.lora_alpha,
lora_config.lora_dropout,
lora_config.init_lora_weights,
)
else:
if isinstance(target, torch.nn.Embedding):
embedding_kwargs = kwargs.copy()
embedding_kwargs.pop("fan_in_fan_out", None)
in_features, out_features = target.num_embeddings, target.embedding_dim
new_module = Embedding(adapter_name, in_features, out_features, **embedding_kwargs)
else:
if isinstance(target, torch.nn.Linear):
if kwargs["fan_in_fan_out"]:
warnings.warn(
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
"Setting fan_in_fan_out to False."
)
kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
else:
raise ValueError(
f"Target module {target} is not supported. "
f"Currently, only `torch.nn.Linear` and its subclasses are supported."
)
new_module = GPTQLoraLinear(adapter_name, target, **kwargs)
self._replace_module(parent, target_name, new_module, target)
if not is_target_modules_in_base_model:
raise ValueError(
f"Target modules {lora_config.target_modules} not found in the base model. "
f"Please check the target modules and try again."
)
def _replace_module(self, parent_module, child_name, new_module, old_module):
setattr(parent_module, child_name, new_module)
if not isinstance(new_module, GPTQLoraLinear):
new_module.weight = old_module.weight
if hasattr(old_module, "bias"):
if old_module.bias is not None:
new_module.bias = old_module.bias
if getattr(old_module, "state", None) is not None:
new_module.state = old_module.state
new_module.to(old_module.weight.device)
# dispatch to correct device
for name, module in new_module.named_modules():
if "lora_" in name:
module.to(old_module.weight.device)
def merge_adapter(self):
raise NotImplementedError("gptq model not support merge ada lora adapter")
def unmerge_adapter(self):
raise NotImplementedError("gptq model not support unmerge ada lora adapter")
def merge_and_unload(self):
raise NotImplementedError("gptq model not support merge and unload")
class GPTQAdaLoraConfig(AdaLoraConfig):
injected_fused_attention: bool = False
injected_fused_mlp: bool = False
class GPTQSVDLinear(torch.nn.Linear, AdaLoraLayer):
def __init__(
self,
adapter_name: str,
linear_module: torch.nn.Linear,
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
**kwargs,
):
init_lora_weights = kwargs.pop("init_lora_weights", True)
torch.nn.Linear.__init__(self, linear_module.in_features, linear_module.out_features)
AdaLoraLayer.__init__(self, linear_module.in_features, linear_module.out_features)
self.linear_module = linear_module
self.weight.requires_grad = False
self.weight = self.linear_module.weight
self.bias = self.linear_module.bias
self.fan_in_fan_out = fan_in_fan_out
if fan_in_fan_out:
self.weight.data = self.weight.data.T
self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights)
self.active_adapter = adapter_name
def merge(self):
raise NotImplementedError("gptq model not support merge lora adapter")
def unmerge(self):
raise NotImplementedError("gptq model not support unmerge lora adapter")
def forward(self, x: torch.Tensor):
if self.active_adapter not in self.lora_A.keys():
return self.linear_module(x)
if self.disable_adapters:
if self.r[self.active_adapter] > 0 and self.merged:
self.unmerge()
result = self.linear_module(x)
elif self.r[self.active_adapter] > 0 and not self.merged:
result = self.linear_module(x)
result += (
(
self.lora_dropout[self.active_adapter](x)
@ (self.lora_A[self.active_adapter] * self.lora_E[self.active_adapter]).T
@ self.lora_B[self.active_adapter].T
)
* self.scaling[self.active_adapter]
/ (self.ranknum[self.active_adapter] + 1e-5)
)
else:
result = self.linear_module(x)
return result
class GPTQAdaLoraModel(AdaLoraModel):
def _find_and_replace(self, adapter_name):
lora_config = self.peft_config[adapter_name]
is_target_modules_in_base_model = False
kwargs = {
"r": lora_config.init_r,
"lora_alpha": lora_config.lora_alpha,
"lora_dropout": lora_config.lora_dropout,
"fan_in_fan_out": lora_config.fan_in_fan_out,
"init_lora_weights": lora_config.init_lora_weights,
}
key_list = [key for key, _ in self.model.named_modules()]
for key in key_list:
if isinstance(lora_config.target_modules, str):
target_module_found = re.fullmatch(lora_config.target_modules, key)
else:
target_module_found = any(key.endswith(target_key) for target_key in lora_config.target_modules)
if target_module_found:
if not is_target_modules_in_base_model:
is_target_modules_in_base_model = True
parent, target, target_name = _get_submodules(self.model, key)
bias = target.bias is not None
if isinstance(target, LoraLayer):
target.update_layer(
adapter_name,
lora_config.init_r,
lora_config.lora_alpha,
lora_config.lora_dropout,
lora_config.init_lora_weights,
)
else:
if isinstance(target, torch.nn.Linear):
in_features, out_features = target.in_features, target.out_features
if kwargs["fan_in_fan_out"]:
warnings.warn(
"fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
"Setting fan_in_fan_out to False."
)
kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
else:
raise ValueError(
f"Target module {target} is not supported. "
f"Currently, only `torch.nn.Linear` and its subclasses are supported."
)
new_module = GPTQSVDLinear(adapter_name, target, **kwargs)
self._replace_module(parent, target_name, new_module, target)
if not is_target_modules_in_base_model:
raise ValueError(
f"Target modules {lora_config.target_modules} not found in the base model. "
f"Please check the target modules and try again."
)
def _replace_module(self, parent_module, child_name, new_module, old_module):
setattr(parent_module, child_name, new_module)
# dispatch to correct device
for name, module in new_module.named_modules():
if "lora_" in name:
module.to(old_module.weight.device)
    def merge_adapter(self):
        raise NotImplementedError("gptq model does not support merging adalora adapters")
    def unmerge_adapter(self):
        raise NotImplementedError("gptq model does not support unmerging adalora adapters")
    def merge_and_unload(self):
        raise NotImplementedError("gptq model does not support merge_and_unload")
def find_all_linear_names(model: BaseGPTQForCausalLM, ignore: Optional[List[str]] = None, ignore_lm_head: bool = True):
if not ignore:
ignore = []
lm_head_name = model.lm_head_name
if ignore_lm_head and lm_head_name not in ignore:
ignore.append(lm_head_name)
results = set()
for n, m in model.named_modules():
if isinstance(m, torch.nn.Linear):
res = n.split('.')[-1]
if res not in ignore:
results.add(res)
return list(results)
@contextmanager
def hijack_peft_mappings():
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel
try:
yield
except:
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel
raise
finally:
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.LORA] = GPTQLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.LORA] = GPTQLoraModel
PEFT_TYPE_TO_CONFIG_MAPPING[PeftType.ADALORA] = GPTQAdaLoraConfig
PEFT_TYPE_TO_MODEL_MAPPING[PeftType.ADALORA] = GPTQAdaLoraModel
def get_gptq_peft_model(
model: BaseGPTQForCausalLM,
peft_config: PeftConfig = None,
model_id: str = None,
adapter_name: str = "default",
auto_find_all_linears: bool = True,
train_mode: bool = False
):
if train_mode and not model.trainable:
model.enable_trainable_mode()
if train_mode and not peft_config:
raise ValueError("peft_config not specified when in train mode.")
if not train_mode and not model_id:
raise ValueError("model_id(where to load adapters) not specified when in inference mode.")
if model.fused_attn_module_type is not None and not model.injected_fused_attention:
peft_types = [PeftType.LORA.value, PeftType.ADALORA.value]
warnings.warn(
f"You can just ignore this warning if the peft type you use isn't in {peft_types}.\n"
f"{model.__class__.__name__} supports injecting fused attention but not enables this time. "
"If you are training adapters, you must also disable fused attention injection when loading quantized "
"base model at inference time, otherwise adapters may not be added to base model properly. "
"If you are loading adapters to do inference, you can reference to adapter's config file to check "
"whether the adapters are trained using base model that not enable fused attention injection."
)
if model.injected_fused_mlp:
raise NotImplementedError("GPTQ model that enables fused mlp injection is not supported to integrate with peft.")
if train_mode:
peft_type = peft_config.peft_type
if not isinstance(peft_type, str):
peft_type = peft_type.value
if peft_type in [PeftType.LORA.value, PeftType.ADALORA.value]:
if auto_find_all_linears:
peft_config.target_modules = find_all_linear_names(model, ignore_lm_head=True)
if peft_type == PeftType.LORA.value and not isinstance(peft_config, GPTQLoraConfig):
peft_config = GPTQLoraConfig(**peft_config.to_dict())
if peft_type == PeftType.ADALORA.value and not isinstance(peft_config, GPTQAdaLoraConfig):
peft_config = GPTQAdaLoraConfig(**peft_config.to_dict())
peft_config.injected_fused_attention = model.injected_fused_attention
peft_config.injected_fused_mlp = model.injected_fused_mlp
if peft_type == PeftType.ADAPTION_PROMPT.value:
if peft_config.adapter_layers > model.config.num_hidden_layers:
warnings.warn(
f"model has only {model.config.num_hidden_layers} layers "
f"but adapter_layers is set to {peft_config.adapter_layers}, "
f"will reset value to {model.config.num_hidden_layers}."
)
peft_config.adapter_layers = model.config.num_hidden_layers
if model.injected_fused_attention:
raise NotImplementedError(
"model with fused attention injected isn't supported to use ADAPTION_PROMPT peft type yet."
)
with hijack_peft_mappings():
try:
if train_mode:
peft_model = get_peft_model(model.model, peft_config)
else:
peft_model = PeftModel.from_pretrained(model.model, model_id, adapter_name)
except:
raise NotImplementedError(
f"{model.__class__.__name__} not support {peft_config.peft_type.value} peft type yet."
)
return peft_model
__all__ = [
"GPTQLoraConfig",
"GPTQLoraModel",
"GPTQAdaLoraConfig",
"GPTQAdaLoraModel",
"find_all_linear_names",
"get_gptq_peft_model"
]
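For orientation, a minimal training-mode sketch of how the helpers above fit together; the model path is a placeholder and the calls mirror the peft example scripts added later in this commit:

```python
from peft import TaskType

from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.peft_utils import GPTQLoraConfig

# load a trainable quantized model; triton kernels are required for adapter training
model = AutoGPTQForCausalLM.from_quantized(
    "PATH/TO/QUANTIZED/MODEL",  # placeholder: local dir or Hub repo id
    use_triton=True,
    warmup_triton=False,
    trainable=True,
    inject_fused_attention=True,
    inject_fused_mlp=False,
)
model.warmup_triton()

# wrap the quantized base model with a LoRA adapter; target linear modules are found automatically
peft_config = GPTQLoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
```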


@ -0,0 +1,215 @@
import sys
import torch
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
class Perplexity:
"""
A class for calculating the perplexity of a language model.
"""
def __init__(self, model, tokenizer, dataset_path='wikitext', dataset_name=None, split='test', text_column='text'):
"""
Calculate perplexity using the same method as seen in llama.cpp.
Parameters
----------
model : AutoModelForCausalLM
The language model for which the perplexity is calculated.
tokenizer : AutoTokenizer
The tokenizer corresponding to the model.
dataset_path : str, optional
The path to the dataset on the Hugging Face dataset hub. Default is 'wikitext'.
dataset_name : str, optional
The name of the dataset. Default is None.
split : str, optional
The split of the dataset to use. Default is 'test'.
text_column : str, optional
The name of the column in the dataset that contains the text data. Default is 'text'.
"""
self._model = model
self._tokenizer = tokenizer
self._dataset_path = dataset_path
self._dataset_name = dataset_name
self._split = split
self._text_column = text_column
self._text = self._prepare_data()
def _get_device(self):
if torch.backends.mps.is_available():
return 'mps'
elif torch.cuda.is_available():
return 'cuda:0'
else:
return 'cpu'
def _prepare_data(self):
"""
Prepares the dataset by loading and formatting.
Returns
-------
str
The formatted dataset as a single string.
"""
if self._dataset_path == 'wikitext':
self._dataset_name = 'wikitext-2-raw-v1'
# Load the dataset
data = load_dataset(self._dataset_path, self._dataset_name, split=self._split)
# Format the text column of the dataset
text_list = [' \n' if s == '' else s for s in data[self._text_column]]
return ''.join(text_list)
@staticmethod
def softmax(logits):
"""
Static method for applying the softmax function.
Parameters
----------
logits : np.ndarray
The input to the softmax function.
Returns
-------
np.ndarray
The output of the softmax function.
"""
e_x = np.exp(logits - np.max(logits))
return e_x / e_x.sum(axis=0)
def calculate_perplexity(self, n_ctx=512, n_batch=512):
"""
Calculates the perplexity of the language model.
Parameters
----------
n_ctx : int
The context size.
n_batch : int
The batch size.
Returns
-------
list
The list of perplexity scores calculated.
"""
# Tokenize the text
self._tokenizer.model_max_length = sys.maxsize
tokens = self._tokenizer(self._text, truncation=False, return_tensors='pt').input_ids.to(self._model.device)
nll = 0.0 # Negative log likelihood
count = 0 # Counter for processed tokens
curr_ppl = 0
all_perplexity = []
with tqdm(range(len(tokens[0]) // n_ctx), desc="Perplexity: - ") as progress:
for i in progress:
# Process each batch of tokens
nll, count = self._process_batch(i, n_ctx, n_batch, tokens, nll, count)
# Calculate and display the current perplexity
curr_ppl = np.exp(nll / count)
all_perplexity.append(curr_ppl)
progress.set_description(f"Perplexity: {curr_ppl:.4f}")
return all_perplexity
def _process_batch(self, i, n_ctx, n_batch, tokens, nll, count):
"""
Processes each batch of tokens.
Parameters
----------
i : int
The batch index.
n_ctx : int
The context size.
n_batch : int
The batch size.
tokens : torch.Tensor
The tokenized text.
nll : float
The current negative log likelihood.
count : int
The current count of processed tokens.
Returns
-------
float
The updated negative log likelihood.
int
The updated count of processed tokens.
"""
start = i * n_ctx
end = start + n_ctx
num_batches = (n_ctx + n_batch - 1) // n_batch
logits = []
for j in range(num_batches):
batch_start = start + j * n_batch
batch_size = min(end - batch_start, n_batch)
token_org = tokens[0][batch_start].item()
if j == 0:
# Replace the first token with the BOS token
tokens[0][batch_start] = self._tokenizer.bos_token_id
# Compute the logits for the current batch of tokens
batch_logits = self._compute_batch_logits(tokens, batch_start, batch_size)
tokens[0][batch_start] = token_org
logits.append(batch_logits)
# We rely on the fact that attention in the forward pass only looks at previous
# tokens here, so the logits returned for each token are an accurate representation
# of what the model would have predicted at that point.
#
# Example, we have a context window of 512, we will compute perplexity for each of the
# last 256 tokens. Then, we split the input up into context window size chunks to
# process the entire prompt.
for j in range(min(512, n_ctx // 2), n_ctx - 1):
tok_logits = logits[0][0][j].cpu().numpy()
# Compute the probability of the next token
prob = self.softmax(tok_logits)[tokens[0][start + j + 1]]
# Update the negative log likelihood and the count of processed tokens
nll += -np.log(prob, where=prob>0)
count += 1
return nll, count
def _compute_batch_logits(self, tokens, batch_start, batch_size):
"""
Computes the logits for a batch of tokens.
Parameters
----------
tokens : torch.Tensor
The tokenized text.
batch_start : int
The start index of the batch.
batch_size : int
The size of the batch.
Returns
-------
torch.Tensor
The logits for the batch of tokens.
"""
# Compute the logits without keeping track of gradients
with torch.no_grad():
outputs = self._model(tokens[:, batch_start:batch_start+batch_size])
return outputs.logits.detach()
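The chunk loop above reports `np.exp(nll / count)`, i.e. $\mathrm{PPL} = \exp\!\big(-\tfrac{1}{N}\sum_{i=1}^{N}\log p(x_i \mid x_{<i})\big)$ over the tokens scored so far. For orientation, a minimal usage sketch of the class; the model and dataset names are placeholders, and the full CLI wrapper is the `examples/benchmark/perplexity.py` script added later in this commit:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# any causal LM works here; gpt2 is only a small placeholder
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto")

ppl = Perplexity(model, tokenizer, dataset_path="wikitext", split="test", text_column="text")
scores = ppl.calculate_perplexity(n_ctx=512, n_batch=512)
print(scores[-1])  # running perplexity after the last chunk
```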


@ -172,16 +172,16 @@ void vecquant4matmul_faster_old(
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant2matmul_old", &vecquant2matmul_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant3matmul_old", &vecquant3matmul_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant4matmul_old", &vecquant4matmul_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant8matmul_old", &vecquant8matmul_old, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA), faster version");
m.def("vecquant3matmul_faster_old", &vecquant3matmul_faster_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA), faster version");
m.def("vecquant4matmul_faster_old", &vecquant4matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
}


@ -0,0 +1,187 @@
#include <torch/all.h>
#include <torch/python.h>
#include <c10/cuda/CUDAGuard.h>
void vecquant2matmul_cuda(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
);
void vecquant2matmul(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant2matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}
void vecquant3matmul_cuda(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
);
void vecquant3matmul(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant3matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}
void vecquant4matmul_cuda(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
);
void vecquant4matmul(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant4matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}
void vecquant8matmul_cuda(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
);
void vecquant8matmul(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
torch::Tensor g_idx
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
}
// old
void vecquant2matmul_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
);
void vecquant2matmul_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
  vecquant2matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}
void vecquant3matmul_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
);
void vecquant3matmul_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant3matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}
void vecquant4matmul_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
);
void vecquant4matmul_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant4matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}
void vecquant8matmul_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
);
void vecquant8matmul_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant8matmul_cuda_old(vec, mat, mul, scales, zeros, groupsize);
}
void vecquant2matmul_faster_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
);
void vecquant2matmul_faster_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant2matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}
void vecquant3matmul_faster_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
);
void vecquant3matmul_faster_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant3matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}
void vecquant4matmul_faster_cuda_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
);
void vecquant4matmul_faster_old(
torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor scales, torch::Tensor zeros,
int groupsize, int vec_height
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
vecquant4matmul_faster_cuda_old(vec, mat, mul, scales, zeros, groupsize, vec_height);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("vecquant2matmul", &vecquant2matmul, "Vector 2-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant3matmul", &vecquant3matmul, "Vector 3-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant4matmul", &vecquant4matmul, "Vector 4-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
m.def("vecquant2matmul_old", &vecquant2matmul_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant3matmul_old", &vecquant3matmul_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant4matmul_old", &vecquant4matmul_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant8matmul_old", &vecquant8matmul_old, "Vector 8-bit Quantized Matrix Multiplication (CUDA)");
m.def("vecquant2matmul_faster_old", &vecquant2matmul_faster_old, "Vector 2-bit Quantized Matrix Multiplication (CUDA), faster version");
m.def("vecquant3matmul_faster_old", &vecquant3matmul_faster_old, "Vector 3-bit Quantized Matrix Multiplication (CUDA), faster version");
m.def("vecquant4matmul_faster_old", &vecquant4matmul_faster_old, "Vector 4-bit Quantized Matrix Multiplication (CUDA), faster version");
}


@ -7,29 +7,66 @@
// atomicAdd for double-precision floating-point numbers on hardware with
// compute capability < 6.0 from:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
__device__ double atomicAdd(
double* address,
double val
) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
// __device__ double atomicAdd(
// double* address,
// double val
// ) {
// unsigned long long int* address_as_ull = (unsigned long long int*)address;
// unsigned long long int old = *address_as_ull, assumed;
//
// do {
// assumed = old;
// old = atomicCAS(
// address_as_ull,
// assumed,
// __double_as_longlong(val + __longlong_as_double(assumed))
// );
//
// // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
// } while (assumed != old);
//
// return __longlong_as_double(old);
// }
// #endif
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
// adapted from https://github.com/torch/cutorch/blob/master/lib/THC/THCAtomics.cuh
__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));
unsigned int old = *address_as_ui;
unsigned int assumed;
  do {
    assumed = old;
    unsigned short hsum = reinterpret_cast<size_t>(address) & 2 ? (old >> 16) : (old & 0xffff);
    hsum += val;
    old = reinterpret_cast<size_t>(address) & 2
             ? (old & 0xffff) | (hsum << 16)
             : (old & 0xffff0000) | hsum;
    old = atomicCAS(address_as_ui, assumed, old);
  // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);
}
__device__ __forceinline__ void atomicAdd(__half* address, c10::Half val) {
  unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
  unsigned int old = *address_as_ui;
  unsigned int assumed;
  do {
    assumed = old;
    __half_raw hsum;
    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
    half tmpres = __hadd(hsum, val);
    hsum = __half_raw(tmpres);
    old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
    old = atomicCAS(address_as_ui, assumed, old);
  } while (assumed != old);
}
#endif
template <typename scalar_t>
__global__ void VecQuant2MatMulKernel(
const scalar_t* __restrict__ vec,

File diff suppressed because it is too large

@ -1,4 +1,6 @@
## <center>News or Update</center>
- 2023-07-26 - (Update) - An elegant [PPL benchmark script](examples/benchmark/perplexity.py) to get results that can be fairly compared with other libraries such as `llama.cpp`.
- 2023-06-05 - (Update) - Integrate with 🤗 peft to use gptq quantized model to train adapters, support LoRA, AdaLoRA, AdaptionPrompt, etc.
- 2023-05-30 - (Update) - support download/upload quantized model from/to 🤗 Hub.
- 2023-05-27 - (Update) - Support quantization and inference for `gpt_bigcode`, `codegen` and `RefineWeb/RefineWebModel`(falcon) model types.
- 2023-05-04 - (Update) - Support using faster cuda kernel when `not desc_act or group_size == -1`


@ -13,9 +13,9 @@ python basic_usage.py
This script also showcases how to download/upload quantized models from/to 🤗 Hub; to enable those features, you can uncomment the commented code. A short, hedged sketch of the download path is shown below.
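For reference, a minimal sketch of the download half (loading a GPTQ checkpoint straight from the Hub); the repo id and basename are placeholders borrowed from the perplexity example later in this commit:

```python
from auto_gptq import AutoGPTQForCausalLM

# repo id and basename are placeholders; any GPTQ-quantized checkpoint on the Hub should work
model = AutoGPTQForCausalLM.from_quantized(
    "TheBloke/open-llama-7b-open-instruct-GPTQ",
    model_basename="gptq_model-4bit-128g",
    use_safetensors=True,
    device_map="auto",
)
```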
To Execute `basic_usage_with_wikitext2.py`, using command like this:
To Execute `basic_usage_wikitext2.py`, using command like this:
```shell
python basic_usage_with_wikitext2.py
python basic_usage_wikitext2.py
```
> Note: There is about a 0.6 ppl degradation on the opt-125m model using AutoGPTQ, compared to GPTQ-for-LLaMa.
@ -66,11 +66,48 @@ Use `--help` flag to see detailed descriptions for more command arguments.
> Commands in this chapter should be run under `benchmark` folder.
### Generation Speed
`generation_speed.py` scripts gives an example of how to benchmark the generations speed of pretrained and quantized models that `auto_gptq` supports, this benchmarks model generation speed in tokens/s metric.
`generation_speed.py` script gives an example of how to benchmark the generations speed of pretrained and quantized models that `auto_gptq` supports, this benchmarks model generation speed in tokens/s metric.
To eexcute this script, using command like this:
To execute this script, using command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python generation_speed.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
## PEFT
> Commands in this chapter should be run under `peft` folder.
### Lora
The `peft_lora_clm_instruction_tuning.py` script gives an example of instruction tuning a gptq quantized model's lora adapter on the alpaca dataset, using tools in `auto_gptq.utils.peft_utils` and `🤗 peft`.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python peft_lora_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
### AdaLora
The `peft_adalora_clm_instruction_tuning.py` script gives an example of instruction tuning a gptq quantized model's adalora adapter on the alpaca dataset, using tools in `auto_gptq.utils.peft_utils` and `🤗 peft`.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python peft_adalora_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
### AdaptionPrompt
The `peft_adaption_prompt_clm_instruction_tuning.py` script gives an example of instruction tuning a gptq quantized model's adaption_prompt adapter (llama-adapter) on the alpaca dataset, using tools in `auto_gptq.utils.peft_utils` and `🤗 peft`.
To execute this script, use a command like this:
```shell
CUDA_VISIBLE_DEVICES=0 python peft_adaption_prompt_clm_instruction_tuning.py --model_name_or_path PATH/TO/MODEL/DIR
```
Use `--help` flag to see detailed descriptions for more command arguments.
If you want to try models other than llama, you can install peft from source using [this branch](https://github.com/PanQiWei/peft/tree/multi_modal_adaption_prompt); see [here](https://github.com/PanQiWei/peft/blob/a5f8f74f07591efe5eb3d08cb1b31b981e84a069/src/peft/tuners/adaption_prompt.py#L235)
to check which other models are also supported. With this branch installed, you can also use the `ADAPTION_PROMPT_V2` peft type (llama-adapter-v2) by simply replacing `AdaptionPromptConfig` with `AdaptionPromptV2Config` in the script, as sketched below.
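A hypothetical sketch of that swap; the constructor arguments below are assumed to match `AdaptionPromptConfig` (as used in the adaption_prompt example above), so check the linked branch for the exact signature:

```python
from peft import TaskType
from peft import AdaptionPromptV2Config  # only available on the linked peft branch

# assumption: same core arguments as AdaptionPromptConfig
peft_config = AdaptionPromptV2Config(
    adapter_len=10,
    adapter_layers=30,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)
```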

File diff suppressed because it is too large

@ -144,7 +144,9 @@ def load_model_tokenizer(
trust_remote_code: bool = False,
use_triton: bool = False,
use_safetensors: bool = False,
use_fast_tokenizer: bool = False
use_fast_tokenizer: bool = False,
inject_fused_attention: bool = True,
inject_fused_mlp: bool = True
):
tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=tokenizer_name_or_path or model_name_or_path,
@ -163,12 +165,12 @@ def load_model_tokenizer(
)
else:
model = AutoGPTQForCausalLM.from_quantized(
save_dir=model_name_or_path,
model_name_or_path,
max_memory=max_memory,
low_cpu_mem_usage=True,
use_triton=use_triton,
inject_fused_attention=True,
inject_fused_mlp=True,
inject_fused_attention=inject_fused_attention,
inject_fused_mlp=inject_fused_mlp,
use_cuda_fp16=True,
quantize_config=quantize_config,
model_basename=model_basename,
@ -232,6 +234,8 @@ def main():
parser.add_argument("--use_triton", action="store_true")
parser.add_argument("--use_safetensors", action="store_true")
parser.add_argument("--use_fast_tokenizer", action="store_true")
parser.add_argument("--no_inject_fused_attention", action="store_true")
parser.add_argument("--no_inject_fused_mlp", action="store_true")
parser.add_argument("--num_samples", type=int, default=10)
parser.add_argument("--per_gpu_max_memory", type=int, default=None)
parser.add_argument("--cpu_max_memory", type=int, default=None)
@ -269,7 +273,9 @@ def main():
trust_remote_code=args.trust_remote_code,
use_triton=args.use_triton,
use_safetensors=args.use_safetensors,
use_fast_tokenizer=args.use_fast_tokenizer
use_fast_tokenizer=args.use_fast_tokenizer,
inject_fused_attention=not args.no_inject_fused_attention,
inject_fused_mlp=not args.no_inject_fused_mlp
)
end = time.time()
logger.info(f"model and tokenizer loading time: {end - start:.4f}s")
@ -282,7 +288,9 @@ def main():
model.warmup_triton()
logger.info("loading data")
examples = load_data("dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples, args.max_new_tokens)
examples = load_data(
"../quantization/dataset/alpaca_data_cleaned.json", tokenizer, args.num_samples, args.max_new_tokens
)
generation_config = GenerationConfig(
num_beams=args.num_beams,


@ -0,0 +1,86 @@
import os
import argparse
import torch
from auto_gptq.utils import Perplexity
from transformers import AutoTokenizer
if __name__ == "__main__":
"""
Example usage.
Default usage with GPT2 model:
python examples/benchmark/perplexity.py
Specify GPTQ quantized model:
python examples/benchmark/perplexity.py \
--model_name TheBloke/open-llama-7b-open-instruct-GPTQ \
--model_basename gptq_model-4bit-128g \
--is_quantized
Change your dataset:
python examples/benchmark/perplexity.py --dataset_path tiny_shakespeare
"""
parser = argparse.ArgumentParser(description="Calculate Perplexity for a model.")
parser.add_argument("--model_name", type=str, default='gpt2', help="Model name.")
parser.add_argument("--model_basename", type=str, default=None, help="Model file's basename.")
parser.add_argument("--n_ctx", type=int, default=512, help="Context size.")
parser.add_argument("--n_batch", type=int, default=512, help="Batch size.")
parser.add_argument("--dataset_path", type=str, default='wikitext', help="Path to the dataset.")
parser.add_argument("--dataset_name", type=str, default=None, help="Name of the dataset.")
parser.add_argument("--split", type=str, default='test', help="Dataset split to use.")
parser.add_argument("--text_column", type=str, default='text', help="Column in the dataset containing the text.")
parser.add_argument("--per_gpu_max_memory", type=int, default=None, help="Max memory used in each GPU.")
parser.add_argument("--cpu_max_memory", type=int, default=None, help="Mx memory used in CPU.")
parser.add_argument("--is_quantized", action="store_true", help="Is the model GPTQ quantized?")
parser.add_argument("--use_safetensors", action="store_true", help="Whether to use safetensors model file")
parser.add_argument("--use_fast_tokenizer", action="store_true", help="Wheter to use fast tokenizer")
parser.add_argument("--trust_remote_code", action="store_true", help="Whether to use remote code")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
max_memory = dict()
if args.per_gpu_max_memory is not None and args.per_gpu_max_memory > 0:
if torch.cuda.is_available():
max_memory.update(
{i: f"{args.per_gpu_max_memory}GIB" for i in range(torch.cuda.device_count())}
)
if args.cpu_max_memory is not None and args.cpu_max_memory > 0 and max_memory:
max_memory["cpu"] = f"{args.cpu_max_memory}GIB"
if not max_memory:
max_memory = None
if args.is_quantized:
from auto_gptq import AutoGPTQForCausalLM
model = AutoGPTQForCausalLM.from_quantized(
args.model_name,
low_cpu_mem_usage=True,
device_map="auto",
max_memory=max_memory,
model_basename=args.model_basename,
use_safetensors=args.use_safetensors,
trust_remote_code=args.trust_remote_code,
inject_fused_mlp=False,
inject_fused_attention=False
)
else:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
args.model_name,
low_cpu_mem_usage=True,
device_map="auto",
max_memory=max_memory,
torch_dtype=torch.float16,
trust_remote_code=args.trust_remote_code
)
ppl = Perplexity(model, tokenizer, args.dataset_path, args.dataset_name, args.split, args.text_column)
ppl.calculate_perplexity(args.n_ctx, args.n_batch)


@ -0,0 +1,169 @@
import json
import os
from argparse import ArgumentParser
from functools import partial
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from auto_gptq.utils.peft_utils import GPTQAdaLoraConfig
from peft import TaskType
parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--lr", type=float, default=3e-3)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path
lr = args.lr
num_epochs = args.num_epochs
# creating model
peft_config = GPTQAdaLoraConfig(
init_r=20,
target_r=16,
beta1=0.85,
beta2=0.85,
tinit=200,
tfinal=1000,
deltaT=10,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoGPTQForCausalLM.from_quantized(
model_name_or_path,
use_triton=True,
warmup_triton=False,
trainable=True,
inject_fused_attention=True,
inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"
def ds_refactor_fn(samples):
instruction_data = samples["instruction"]
input_data = samples["input"]
output_data = samples["output"]
new_samples = {"prompt": [], "output": []}
for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
if input_txt:
prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
else:
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
new_samples["prompt"].append(prompt)
new_samples["output"].append(output_txt)
return new_samples
ds = Dataset.from_generator(
lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
make_data_block,
batched=True,
batch_size=len(ds),
num_proc=1,
remove_columns=ds.column_names,
keep_in_memory=True,
load_from_cache_file=False,
fn_kwargs={
"prompt_col_name": "prompt",
"label_col_name": "output",
"tokenizer": tokenizer,
"preprocess_fn": ds_refactor_fn,
"sample_max_len": args.sample_max_length,
"block_max_len": args.block_max_length,
"add_eos_token": True,
"truncate_prompt": False,
"merge_prompt_label": True
}
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
model.base_model.peft_config["default"].total_step = len(train_dataloader) * num_epochs
# training and evaluation
with torch.cuda.amp.autocast():
global_step = 0
for epoch in range(num_epochs):
model.train()
total_loss = 0
progress_bar = tqdm(train_dataloader)
for step, batch in enumerate(progress_bar):
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
loss.backward()
optimizer.step()
lr_scheduler.step()
# Update the importance of low-rank matrices
# and allocate the budget accordingly.
model.base_model.update_and_allocate(global_step)
optimizer.zero_grad()
global_step += 1
progress_bar.set_postfix(loss=loss.item())
model.eval()
eval_loss = 0
eval_preds = []
for step, batch in enumerate(tqdm(eval_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
eval_loss += loss.detach().float()
eval_preds.extend(
tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
)
eval_epoch_loss = eval_loss / len(eval_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))


@ -0,0 +1,158 @@
import json
import os
from argparse import ArgumentParser
from functools import partial
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from peft import TaskType, AdaptionPromptConfig
parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--adapter_len", type=int, default=10)
parser.add_argument("--adapter_layers", type=int, default=30)
parser.add_argument("--lr", type=float, default=3e-3)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path
lr = args.lr
num_epochs = args.num_epochs
# creating model
peft_config = AdaptionPromptConfig(
adapter_len=args.adapter_len,
adapter_layers=args.adapter_layers,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoGPTQForCausalLM.from_quantized(
model_name_or_path,
use_triton=True,
warmup_triton=False,
trainable=True,
inject_fused_attention=False,
inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"
def ds_refactor_fn(samples):
instruction_data = samples["instruction"]
input_data = samples["input"]
output_data = samples["output"]
new_samples = {"prompt": [], "output": []}
for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
if input_txt:
prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
else:
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
new_samples["prompt"].append(prompt)
new_samples["output"].append(output_txt)
return new_samples
ds = Dataset.from_generator(
lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
make_data_block,
batched=True,
batch_size=len(ds),
num_proc=1,
remove_columns=ds.column_names,
keep_in_memory=True,
load_from_cache_file=False,
fn_kwargs={
"prompt_col_name": "prompt",
"label_col_name": "output",
"tokenizer": tokenizer,
"preprocess_fn": ds_refactor_fn,
"sample_max_len": args.sample_max_length,
"block_max_len": args.block_max_length,
"add_eos_token": True,
"truncate_prompt": False,
"merge_prompt_label": True
}
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
# training and evaluation
with torch.cuda.amp.autocast():
for epoch in range(num_epochs):
model.train()
total_loss = 0
progress_bar = tqdm(train_dataloader)
for step, batch in enumerate(progress_bar):
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.set_postfix(loss=loss.item())
model.eval()
eval_loss = 0
eval_preds = []
for step, batch in enumerate(tqdm(eval_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
eval_loss += loss.detach().float()
eval_preds.extend(
tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
)
eval_epoch_loss = eval_loss / len(eval_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))


@ -0,0 +1,158 @@
import json
import os
from argparse import ArgumentParser
from functools import partial
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model
from auto_gptq.utils.data_utils import make_data_block, collate_data
from auto_gptq.utils.peft_utils import GPTQLoraConfig
from peft import TaskType
parser = ArgumentParser()
parser.add_argument("--model_name_or_path", type=str)
parser.add_argument("--lr", type=float, default=3e-5)
parser.add_argument("--num_epochs", type=int, default=1)
parser.add_argument("--sample_max_length", type=int, default=1024, help="max length of sample")
parser.add_argument("--block_max_length", type=int, default=1024, help="max length of data block(bunch of samples)")
parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
parser.add_argument("--use_fast_tokenizer", action="store_true")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = args.model_name_or_path
tokenizer_name_or_path = args.tokenizer_name_or_path or model_name_or_path
lr = args.lr
num_epochs = args.num_epochs
# creating model
peft_config = GPTQLoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.1,
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=args.use_fast_tokenizer)
if not tokenizer.pad_token_id:
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoGPTQForCausalLM.from_quantized(
model_name_or_path,
use_triton=True,
warmup_triton=False,
trainable=True,
inject_fused_attention=True,
inject_fused_mlp=False
)
model.warmup_triton()
device = model.device
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
# loading dataset
WITH_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Output:\n"
WITHOUT_INPUT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Output:\n"
def ds_refactor_fn(samples):
instruction_data = samples["instruction"]
input_data = samples["input"]
output_data = samples["output"]
new_samples = {"prompt": [], "output": []}
for instruction_txt, input_txt, output_txt in zip(instruction_data, input_data, output_data):
if input_txt:
prompt = WITH_INPUT_TEMPLATE.format(instruction=instruction_txt, input=input_txt)
else:
prompt = WITHOUT_INPUT_TEMPLATE.format(instruction=instruction_txt)
new_samples["prompt"].append(prompt)
new_samples["output"].append(output_txt)
return new_samples
ds = Dataset.from_generator(
lambda: json.load(open("../quantization/dataset/alpaca_data_cleaned.json", "r", encoding="utf-8"))
)
ds = ds.map(
make_data_block,
batched=True,
batch_size=len(ds),
num_proc=1,
remove_columns=ds.column_names,
keep_in_memory=True,
load_from_cache_file=False,
fn_kwargs={
"prompt_col_name": "prompt",
"label_col_name": "output",
"tokenizer": tokenizer,
"preprocess_fn": ds_refactor_fn,
"sample_max_len": args.sample_max_length,
"block_max_len": args.block_max_length,
"add_eos_token": True,
"truncate_prompt": False,
"merge_prompt_label": True
}
)
ds = ds.train_test_split(test_size=len(ds) // 10)
train_ds, eval_ds = ds["train"], ds["test"]
collate_fn = partial(collate_data, pad_token_id=tokenizer.pad_token_id)
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, collate_fn=partial(collate_fn))
eval_dataloader = DataLoader(eval_ds, batch_size=1, shuffle=False, collate_fn=collate_fn)
# optimizer and lr scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=0,
num_training_steps=(len(train_dataloader) * num_epochs),
)
# training and evaluation
with torch.cuda.amp.autocast():
for epoch in range(num_epochs):
model.train()
total_loss = 0
progress_bar = tqdm(train_dataloader)
for step, batch in enumerate(progress_bar):
batch = {k: v.to(device) for k, v in batch.items()}
outputs = model(**batch)
loss = outputs.loss
total_loss += loss.detach().float()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.set_postfix(loss=loss.item())
model.eval()
eval_loss = 0
eval_preds = []
for step, batch in enumerate(tqdm(eval_dataloader)):
batch = {k: v.to(device) for k, v in batch.items()}
with torch.no_grad():
outputs = model(**batch)
loss = outputs.loss
eval_loss += loss.detach().float()
eval_preds.extend(
tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
)
eval_epoch_loss = eval_loss / len(eval_dataloader)
eval_ppl = torch.exp(eval_epoch_loss)
train_epoch_loss = total_loss / len(train_dataloader)
train_ppl = torch.exp(train_epoch_loss)
print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")
model.save_pretrained(os.path.join(model_name_or_path, f"gptq_{peft_config.peft_type.value}_adapter"))


@ -4,29 +4,30 @@ import sys
from pathlib import Path
from setuptools import setup, find_packages
try:
import torch
TORCH_AVAILABLE = True
except ImportError:
TORCH_AVAILABLE = False
IN_GITHUB_ACTIONS = os.environ.get("GITHUB_ACTIONS", "false") == "true"
python_min_version = (3, 8, 0)
python_min_version_str = '.'.join(map(str, python_min_version))
if sys.version_info < python_min_version:
print(f"You are using Python {platform.python_version()}. Python >={python_min_version_str} is required.")
sys.exit(-1)
CUDA_VERSION = "".join(os.environ.get("CUDA_VERSION", "").split("."))
BUILD_CUDA_EXT = int(os.environ.get('BUILD_CUDA_EXT', '1')) == 1
if BUILD_CUDA_EXT:
try:
import torch
except:
print("torch is not installed, please install torch first!")
sys.exit(-1)
CUDA_VERSION = "".join(torch.version.cuda.split("."))
else:
CUDA_VERSION = "".join(os.environ.get("CUDA_VERSION", "").split("."))
version = "0.2.1" + (f"+cu{CUDA_VERSION}" if CUDA_VERSION and IN_GITHUB_ACTIONS else "")
common_setup_kwargs = {
"version": version,
"version": "0.3.2",
"name": "auto_gptq",
"author": "PanQiWei",
"description": "An easy-to-use LLMs quantization package with user-friendly apis, based on GPTQ algorithm.",
"long_description": (Path(__file__).parent / "README.md").read_text(),
"long_description": (Path(__file__).parent / "README.md").read_text(encoding="UTF-8"),
"long_description_content_type": "text/markdown",
"url": "https://github.com/PanQiWei/AutoGPTQ",
"keywords": ["gptq", "quantization", "large-language-models", "pytorch", "transformers"],
@ -45,6 +46,9 @@ common_setup_kwargs = {
"python_requires": f">={python_min_version_str}"
}
if CUDA_VERSION:
common_setup_kwargs['version'] += f"+cu{CUDA_VERSION}"
requirements = [
"accelerate>=0.19.0",
"datasets",
@ -52,33 +56,37 @@ requirements = [
"rouge",
"torch>=1.13.0",
"safetensors",
"transformers>=4.26.1"
"transformers>=4.31.0",
"peft"
]
extras_require = {
"llama": ["transformers>=4.28.0"],
"triton": ["triton>=2.0.0"]
}
include_dirs = ["autogptq_cuda"]
if TORCH_AVAILABLE:
BUILD_CUDA_EXT = int(os.environ.get('BUILD_CUDA_EXT', '1')) == 1
additional_setup_kwargs = dict()
if BUILD_CUDA_EXT and (torch.cuda.is_available() or IN_GITHUB_ACTIONS):
additional_setup_kwargs = dict()
if BUILD_CUDA_EXT:
from torch.utils import cpp_extension
from distutils.sysconfig import get_python_lib
conda_cuda_include_dir=os.path.join(get_python_lib(),"nvidia/cuda_runtime/include")
conda_cuda_include_dir = os.path.join(get_python_lib(), "nvidia/cuda_runtime/include")
if os.path.isdir(conda_cuda_include_dir):
include_dirs.append(conda_cuda_include_dir)
print(f"appending conda cuda include dir {conda_cuda_include_dir}")
extensions = [
cpp_extension.CUDAExtension(
"autogptq_cuda",
"autogptq_cuda_64",
[
"autogptq_cuda/autogptq_cuda.cpp",
"autogptq_cuda/autogptq_cuda_kernel.cu"
"autogptq_cuda/autogptq_cuda_64.cpp",
"autogptq_cuda/autogptq_cuda_kernel_64.cu"
]
),
cpp_extension.CUDAExtension(
"autogptq_cuda_256",
[
"autogptq_cuda/autogptq_cuda_256.cpp",
"autogptq_cuda/autogptq_cuda_kernel_256.cu"
]
)
]
@ -87,19 +95,11 @@ if TORCH_AVAILABLE:
"ext_modules": extensions,
"cmdclass": {'build_ext': cpp_extension.BuildExtension}
}
common_setup_kwargs.update(additional_setup_kwargs)
setup(
common_setup_kwargs.update(additional_setup_kwargs)
setup(
packages=find_packages(),
install_requires=requirements,
extras_require=extras_require,
include_dirs=include_dirs,
**common_setup_kwargs
)
else:
setup(
packages=find_packages(),
install_requires=requirements,
extras_require=extras_require,
include_dirs=include_dirs,
**common_setup_kwargs
)
)