diff --git a/examples/README.md b/examples/README.md
index 3f6b5f0..9422813 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -11,6 +11,8 @@ To Execute `basic_usage.py`, using command like this:
 python basic_usage.py
 ```
 
+This script also showcases how to download/upload a quantized model from/to the 🤗 Hub; to enable these features, uncomment the relevant lines in the script.
+
 To Execute `basic_usage_with_wikitext2.py`, using command like this:
 ```shell
 python basic_usage_with_wikitext2.py
diff --git a/examples/quantization/basic_usage.py b/examples/quantization/basic_usage.py
index 99fc8e7..b8c459e 100644
--- a/examples/quantization/basic_usage.py
+++ b/examples/quantization/basic_usage.py
@@ -31,11 +31,28 @@ def main():
     # save quantized model
     model.save_quantized(quantized_model_dir)
 
+    # push quantized model to Hugging Face Hub.
+    # to use use_auth_token=True, log in first via `huggingface-cli login`,
+    # or pass an explicit token with: use_auth_token="hf_xxxxxxx"
+    # (uncomment the following three lines to enable this feature)
+    # repo_id = f"YourUserName/{quantized_model_dir}"
+    # commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+    # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
+
+    # alternatively you can save and push at the same time
+    # (uncomment the following three lines to enable this feature)
+    # repo_id = f"YourUserName/{quantized_model_dir}"
+    # commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+    # model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
+
     # save quantized model using safetensors
     model.save_quantized(quantized_model_dir, use_safetensors=True)
 
     # load quantized model to the first GPU
-    model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir)
+    model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")
+
+    # download quantized model from Hugging Face Hub and load to the first GPU
+    # model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
 
     # inference with model.generate
     print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
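
For reference, a minimal end-to-end sketch of the Hub workflow this diff enables, with the push/load lines uncommented. The quantization setup (`BaseQuantizeConfig`, `from_pretrained`, `model.quantize`, and the `facebook/opt-125m` / `opt-125m-4bit-128g` names) is assumed from the surrounding `basic_usage.py` and is not part of this diff; `YourUserName` is a placeholder to replace with a real Hub namespace.

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# assumed setup, mirroring the rest of basic_usage.py (not shown in this diff)
pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit-128g"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
examples = [tokenizer("auto_gptq is an easy-to-use model quantization library.")]

quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
model.quantize(examples)

# save the quantized model and push it to the Hub in one call
# (requires `huggingface-cli login`, or an explicit token via use_auth_token="hf_...")
repo_id = f"YourUserName/{quantized_model_dir}"  # placeholder repo id
commit_message = (
    f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, "
    f"gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
)
model.push_to_hub(
    repo_id,
    save_dir=quantized_model_dir,
    use_safetensors=True,
    commit_message=commit_message,
    use_auth_token=True,
)

# later: download the quantized weights from the Hub and load them onto the first GPU
model = AutoGPTQForCausalLM.from_quantized(
    repo_id, device="cuda:0", use_safetensors=True, use_triton=False
)
print(tokenizer.decode(
    model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]
))
```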