import os

from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit-128g"


def main():
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
    examples = [
        tokenizer(
            "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
        )
    ]
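
    # NOTE: a single sentence here is only for demonstration; in practice GPTQ
    # calibration usually uses a few hundred tokenized samples drawn from a real
    # text corpus, built the same way as the example above.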

    quantize_config = BaseQuantizeConfig(
        bits=4,  # quantize the model to 4-bit
        group_size=128,  # it is recommended to set this value to 128
        desc_act=False,  # set to False to significantly speed up inference, at the cost of slightly worse perplexity
    )
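
    # NOTE: these settings are stored together with the quantized model
    # (recent AutoGPTQ versions write a quantize_config.json next to the weights),
    # so they do not need to be repeated when the model is loaded again.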

    # load the un-quantized model; by default it is loaded into CPU memory
    model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

    # quantize the model; examples must be a list of dicts whose only keys are "input_ids" and "attention_mask"
    model.quantize(examples)
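
    # NOTE: quantization runs layer by layer over the calibration examples and
    # typically requires a CUDA GPU; expect it to take a while for larger models.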

    # save quantized model
    model.save_quantized(quantized_model_dir)

    # save quantized model using safetensors
    model.save_quantized(quantized_model_dir, use_safetensors=True)
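
    # NOTE: safetensors is generally the preferred format for distribution,
    # since it avoids pickle-based serialization.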

    # load the quantized model onto the first GPU
    model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")
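
    # NOTE: depending on the AutoGPTQ version, from_quantized accepts further
    # options, e.g. use_safetensors=True to load the safetensors checkpoint saved above.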

    # inference with model.generate
    print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
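
    # the usual transformers generation kwargs apply here as well, e.g. pass
    # max_new_tokens to model.generate to control how much text is produced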

    # or you can also use a pipeline
    pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
    print(pipeline("auto-gptq is")[0]["generated_text"])


if __name__ == "__main__":
    import logging

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
    )

    main()