Update README.md
Merge the example code for downloading from and uploading to the HF Hub into the simplest usage code above, to keep the README compact.
parent b7bb50b4d5
commit 17db71491f

1 changed file with 16 additions and 53 deletions

README.md | 69
@@ -115,10 +115,26 @@ model.save_quantized(quantized_model_dir)
 # save quantized model using safetensors
 model.save_quantized(quantized_model_dir, use_safetensors=True)
 
+# push quantized model to Hugging Face Hub.
+# to use use_auth_token=True, login first via huggingface-cli login.
+# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
+# (uncomment the following three lines to enable this feature)
+# repo_id = f"YourUserName/{quantized_model_dir}"
+# commit_message = f"AutoGPTQ model for {pretrained_model}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
+
+# alternatively you can save and push at the same time
+# (uncomment the following three lines to enable this feature)
+# repo_id = f"YourUserName/{quantized_model_dir}"
+# commit_message = f"AutoGPTQ model for {pretrained_model}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
+
 # load quantized model to the first GPU
 model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir)
 
+# download quantized model from Hugging Face Hub and load to the first GPU
+# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
 # inference with model.generate
 print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
@@ -127,59 +143,6 @@ pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
 print(pipeline("auto-gptq is")[0]["generated_text"])
 ```
 
-The following example demonstrates use of the Hugging Face Hub for model downloading and uploading:
-```python
-from transformers import AutoTokenizer, TextGenerationPipeline
-from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-import logging
-
-logging.basicConfig(
-    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
-)
-
-pretrained_model = "facebook/opt-125m"
-quantized_model_dir = "opt-125m-4bit"
-
-tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
-examples = [
-    tokenizer(
-        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
-    )
-]
-
-quantize_config = BaseQuantizeConfig(
-    bits=4,  # quantize model to 4-bit
-    group_size=128,  # it is recommended to set the value to 128
-    desc_act=True  # use desc_act for higher inference quality from quantized model
-)
-
-# Load un-quantized model. By default, the model will always be loaded into CPU memory
-model = AutoGPTQForCausalLM.from_pretrained(pretrained_model, quantize_config)
-
-# Quantize model. Examples should be a list of dict whose keys can only be "input_ids" and "attention_mask"
-model.quantize(examples, use_triton=False)
-
-# save quantized model using safetensors
-model.save_quantized(quantized_model_dir, use_safetensors=True)
-
-repo_id = f"YourUserName/{quantized_model_dir}"
-
-# Push quantized model to Hugging Face Hub.
-# To use use_auth_token=True, login first via huggingface-cli login.
-# Or pass an explicit token with: use_auth_token="hf_xxxxxxx"
-commit_message = f"AutoGPTQ model for {pretrained_model}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
-model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
-
-# Alternatively you can save and push at the same time with:
-# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
-
-# Load quantized model to the first GPU
-model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
-
-# Inference with model.generate
-print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to("cuda:0"))[0]))
-```
-
 For more advanced features of model quantization, please refer to [this script](examples/quantization/quant_with_alpaca.py)
 
 ### Customize Model
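
For readers who want to try the upload path that the first hunk adds in commented-out form, a minimal sketch could look like the following. The helper name `push_quantized_to_hub` is hypothetical, `YourUserName` is a placeholder Hub account, and `model` is assumed to be the in-memory quantized model from the README's basic-usage example:

```python
# Hypothetical helper wrapping the push-to-Hub lines the first hunk leaves commented out.
# Assumes `model` is the already-quantized AutoGPTQ model from the README example and
# that "YourUserName" is replaced with a real Hugging Face account name.
from auto_gptq import BaseQuantizeConfig


def push_quantized_to_hub(model, pretrained_model: str, quantized_model_dir: str,
                          quantize_config: BaseQuantizeConfig) -> None:
    # requires a prior `huggingface-cli login`, or pass use_auth_token="hf_..." instead
    repo_id = f"YourUserName/{quantized_model_dir}"
    commit_message = (
        f"AutoGPTQ model for {pretrained_model}: {quantize_config.bits}bits, "
        f"gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
    )

    # push the in-memory quantized model ...
    model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)

    # ... or, alternatively, save locally and push the saved files in a single call:
    # model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True,
    #                   commit_message=commit_message, use_auth_token=True)
```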
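
A corresponding sketch of the download path (also left commented out in the first hunk): load a quantized model back from the Hub and run generation. The repo id is a placeholder, and the tokenizer is loaded from the base model exactly as in the README example:

```python
# Sketch of downloading an AutoGPTQ-quantized model from the Hugging Face Hub.
# "YourUserName/opt-125m-4bit" is a placeholder for a repo created by push_to_hub above.
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

repo_id = "YourUserName/opt-125m-4bit"  # placeholder Hub repository id

# tokenizer comes from the base model, as in the README example
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", use_fast=True)

# download the quantized weights from the Hub and load them onto the first GPU
model = AutoGPTQForCausalLM.from_quantized(
    repo_id, device="cuda:0", use_safetensors=True, use_triton=False
)

# inference with model.generate, same as the README's basic-usage example
inputs = tokenizer("auto_gptq is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs)[0]))
```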