update basic_usage.py
This commit is contained in:
parent df8672ce75
commit e826d89dbc
2 changed files with 20 additions and 1 deletion
@@ -11,6 +11,8 @@ To execute `basic_usage.py`, use a command like this:
```shell
python basic_usage.py
```

This script also showcases how to download/upload a quantized model from/to the 🤗 Hub; to enable those features, uncomment the commented code.

To execute `basic_usage_with_wikitext2.py`, use a command like this:
```shell
python basic_usage_with_wikitext2.py
```
@@ -31,11 +31,28 @@ def main():
    # save quantized model
    model.save_quantized(quantized_model_dir)

    # push quantized model to Hugging Face Hub.
    # to use use_auth_token=True, log in first via huggingface-cli login.
    # or pass an explicit token with: use_auth_token="hf_xxxxxxx"
    # (uncomment the following three lines to enable this feature)
    # repo_id = f"YourUserName/{quantized_model_dir}"
    # commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
    # model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)

    # alternatively you can save and push at the same time
    # (uncomment the following three lines to enable this feature)
    # repo_id = f"YourUserName/{quantized_model_dir}"
    # commit_message = f"AutoGPTQ model for {pretrained_model_dir}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
    # model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)

    # save quantized model using safetensors
    model.save_quantized(quantized_model_dir, use_safetensors=True)

    # load quantized model to the first GPU
    model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir)
    model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")

    # download quantized model from Hugging Face Hub and load to the first GPU
    # model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)

    # inference with model.generate
    print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))
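For context, the hunk above sits near the end of `basic_usage.py`'s `main()`, after the model has already been quantized. Below is a minimal sketch of the quantization steps that typically precede these lines; the model name, example text, and config values here are illustrative placeholders, not part of this commit.

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# illustrative values; the actual script defines its own directories
pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit-128g"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# calibration examples used by the GPTQ algorithm
examples = [
    tokenizer(
        "auto-gptq is an easy-to-use model quantization library "
        "with user-friendly apis, based on the GPTQ algorithm."
    )
]

quantize_config = BaseQuantizeConfig(
    bits=4,          # quantize weights to 4-bit
    group_size=128,  # quantization group size
    desc_act=False,  # skip activation-order reordering for faster inference
)

# load the full-precision model and run GPTQ on the calibration examples
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)
model.quantize(examples)

# ...followed by the save / push / load / generate lines shown in the diff above
```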