Update README.md

merge the example code of downloading from and uploading to HF Hub into simplest usage code above to keep README compact.
2023-05-30 05:49:29 +08:00 · 2023-05-30 05:49:29 +08:00 · 17db71491f
commit 17db71491f
parent b7bb50b4d5
1 changed files with 16 additions and 53 deletions
--- a/README.md
+++ b/README.md
@@ -115,10 +115,26 @@ model.save_quantized(quantized_model_dir)
 # save quantized model using safetensors
 model.save_quantized(quantized_model_dir, use_safetensors=True)

+# push quantized model to Hugging Face Hub. 
+# to use use_auth_token=True, log in first via huggingface-cli login.
+# or pass an explicit token with: use_auth_token="hf_xxxxxxx"
+# (uncomment the following three lines to enable this feature)
+# repo_id = f"YourUserName/{quantized_model_dir}"
+# commit_message = f"AutoGPTQ model for {pretrained_model}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+# model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
+
+# alternatively you can save and push at the same time
+# (uncomment the following three lines to enable this feature)
+# repo_id = f"YourUserName/{quantized_model_dir}"
+# commit_message = f"AutoGPTQ model for {pretrained_model}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
+# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)

 # load quantized model to the first GPU
 model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir)

+# download quantized model from Hugging Face Hub and load to the first GPU
+# model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
+
 # inference with model.generate
 print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to(model.device))[0]))

@@ -127,59 +143,6 @@ pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
 print(pipeline("auto-gptq is")[0]["generated_text"])
 ```

-The following example demonstrates use of Hugging Face Hub for model downloading and uploading:
-```python
-from transformers import AutoTokenizer, TextGenerationPipeline
-from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
-import logging
-
-logging.basicConfig(
-    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
-)
-
-pretrained_model = "facebook/opt-125m"
-quantized_model_dir = "opt-125m-4bit"
-
-tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
-examples = [
-    tokenizer(
-        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
-    )
-]
-
-quantize_config = BaseQuantizeConfig(
-    bits=4,  # quantize model to 4-bit
-    group_size=128,  # it is recommended to set the value to 128
-    desc_act=True # use desc_act for higher inference quality from quantized model
-)
-
-# Load un-quantized model. By default, the model will always be loaded into CPU memory
-model = AutoGPTQForCausalLM.from_pretrained(pretrained_model, quantize_config)
-
-# Quantize model Examples should be list of dict whose keys can only be "input_ids" and "attention_mask"
-model.quantize(examples, use_triton=False)
-
-# save quantized model using safetensors
-model.save_quantized(quantized_model_dir, use_safetensors=True)
-
-repo_id = f"YourUserName/{quantized_model_dir}"
-
-# Push quantized model to Hugging Face Hub. 
-# To use use_auth_token=True, Login first via huggingface-cli login.
-# Or pass explcit token with: use_auth_token="hf_xxxxxxx"
-commit_message = f"AutoGPTQ model for {pretrained_model}: {quantize_config.bits}bits, gr{quantize_config.group_size}, desc_act={quantize_config.desc_act}"
-model.push_to_hub(repo_id, commit_message=commit_message, use_auth_token=True)
-
-# Alternatively you can save and push at the same time with:
-# model.push_to_hub(repo_id, save_dir=quantized_model_dir, use_safetensors=True, commit_message=commit_message, use_auth_token=True)
-
-# Load quantized model to the first GPU
-model = AutoGPTQForCausalLM.from_quantized(repo_id, device="cuda:0", use_safetensors=True, use_triton=False)
-
-# Inference with model.generate
-print(tokenizer.decode(model.generate(**tokenizer("auto_gptq is", return_tensors="pt").to("cuda:0"))[0]))
-```
-
 For more advanced features of model quantization, please refer to [this script](examples/quantization/quant_with_alpaca.py)

 ### Customize Model