import os
import random

import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit"

# os.makedirs(quantized_model_dir, exist_ok=True)


def get_wikitext2(nsamples, seed, seqlen, model):
    """Build a calibration set of `nsamples` random `seqlen`-token windows from wikitext-2."""
    from datasets import load_dataset
    traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
    testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')

    try:
        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
    trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt')
    testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')

    random.seed(seed)
    np.random.seed(seed)
    torch.random.manual_seed(seed)

    trainloader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        j = i + seqlen
        inp = trainenc.input_ids[:, i:j]
        trainloader.append({'input_ids': inp})
    return trainloader, testenc


@torch.no_grad()
def opt_eval(model, testenc, dev, seqlen=2048):
    """Compute perplexity of an OPT model on `testenc`, keeping one decoder layer on `dev` at a time."""
    print('Evaluating ...')

    testenc = testenc.input_ids
    nsamples = testenc.numel() // seqlen

    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.decoder.layers

    # Move the embeddings (and optional projections) to the target device.
    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev)
    model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev)
    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
        model.model.decoder.project_in = model.model.decoder.project_in.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros((nsamples, seqlen, model.config.hidden_size), dtype=dtype, device=dev)
    cache = {'i': 0, 'attention_mask': None}

    class Catcher(nn.Module):
        """Wraps the first decoder layer to capture its inputs, then aborts the forward pass."""

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            raise ValueError

    layers[0] = Catcher(layers[0])
    for i in range(nsamples):
        batch = testenc[:, (i * seqlen):((i + 1) * seqlen)].to(dev)
        try:
            model(batch)
        except ValueError:
            pass
    layers[0] = layers[0].module

    # Move everything back to CPU; only one decoder layer will live on the device at a time.
    layers[0] = layers[0].cpu()
    model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu()
    model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu()
    if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out:
        model.model.decoder.project_out = model.model.decoder.project_out.cpu()
    if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in:
        model.model.decoder.project_in = model.model.decoder.project_in.cpu()
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    attention_mask = cache['attention_mask']

    # Propagate the captured hidden states through the decoder, layer by layer.
    for i in range(len(layers)):
        print(i)
        layer = layers[i].to(dev)
        for j in range(nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
        layers[i] = layer.cpu()
        del layer
        torch.cuda.empty_cache()
        inps, outs = outs, inps

    if model.model.decoder.final_layer_norm is not None:
        model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev)
    if model.model.decoder.project_out is not None:
        model.model.decoder.project_out = model.model.decoder.project_out.to(dev)
    model.lm_head = model.lm_head.to(dev)

    testenc = testenc.to(dev)
    nlls = []
    for i in range(nsamples):
        hidden_states = inps[i].unsqueeze(0)
        if model.model.decoder.final_layer_norm is not None:
            hidden_states = model.model.decoder.final_layer_norm(hidden_states)
        if model.model.decoder.project_out is not None:
            hidden_states = model.model.decoder.project_out(hidden_states)
        lm_logits = model.lm_head(hidden_states)
        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = testenc[:, (i * seqlen):((i + 1) * seqlen)][:, 1:]
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        neg_log_likelihood = loss.float() * seqlen
        nlls.append(neg_log_likelihood)
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))
    print(ppl.item())

    model.config.use_cache = use_cache


def main():
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
    trainloader, testenc = get_wikitext2(128, 0, 2048, pretrained_model_dir)

    quantize_config = BaseQuantizeConfig(
        bits=4,  # quantize model to 4-bit
        group_size=128,  # it is recommended to set the value to 128
    )

    # Load the un-quantized model; it is always loaded onto CPU first.
    model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

    # Quantize the model. The calibration examples should be a list of dicts whose only
    # allowed keys are "input_ids" and "attention_mask", with torch.LongTensor values.
    model.quantize(trainloader, use_triton=False)

    # Save the quantized model.
    model.save_quantized(quantized_model_dir)

    # Save the quantized model using safetensors.
    model.save_quantized(quantized_model_dir, use_safetensors=True)

    # Load the quantized model; currently only CPU or a single GPU is supported.
    model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_triton=False)

    opt_eval(model.model, testenc, "cuda:0")


if __name__ == "__main__":
    import logging

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    main()
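

# Optional follow-up, not part of the original example: once a quantized model has been
# loaded with `from_quantized`, the TextGenerationPipeline imported above can be used for
# a quick generation smoke test. This is a minimal sketch; the helper name and prompt are
# illustrative only.
def generate_sample(prompt="auto-gptq is"):
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
    model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_triton=False)
    # Wrap the quantized model in a standard transformers text-generation pipeline.
    pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer)
    print(pipeline(prompt)[0]["generated_text"])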