from logging import getLogger
from os.path import join, isfile
from typing import Optional, Union

import accelerate
import torch
import transformers
from transformers import AutoConfig, AutoModelForCausalLM

from ._const import *
from ._utils import *

from ._base import *
from ..nn_modules.fused_llama_attn import FusedLlamaAttentionForQuantizedModel
from ..nn_modules.fused_llama_mlp import FusedLlamaMLPForQuantizedModel

logger = getLogger(__name__)


class LlamaGPTQForCausalLM(BaseGPTQForCausalLM):
    # Class name of the repeated decoder block in transformers' Llama implementation.
    layer_type = "LlamaDecoderLayer"
    # Attribute path to the list of decoder layers on the loaded model.
    layers_block_name = "model.layers"
    # Modules outside the decoder stack (embeddings, final norm) that are not quantized.
    outside_layer_modules = ["model.embed_tokens", "model.norm"]
    # Linear projections inside each decoder layer, grouped in the order they are quantized.
    inside_layer_modules = [
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.o_proj"],
        ["mlp.up_proj", "mlp.gate_proj"],
        ["mlp.down_proj"]
    ]

    # Fused replacement modules injected when fused attention/MLP is enabled at load time.
    fused_attn_module_type = FusedLlamaAttentionForQuantizedModel
    fused_mlp_module_type = FusedLlamaMLPForQuantizedModel


__all__ = ["LlamaGPTQForCausalLM"]
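
# Example usage (a minimal sketch, not part of this module's API surface): this class
# is normally not instantiated directly; the top-level ``auto_gptq.AutoGPTQForCausalLM``
# factory dispatches to it for Llama checkpoints. The checkpoint/output names below are
# placeholders and the exact keyword arguments may differ across auto_gptq versions.
#
#   from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
#
#   quantize_config = BaseQuantizeConfig(bits=4, group_size=128)
#   model = AutoGPTQForCausalLM.from_pretrained("path/to/llama-checkpoint", quantize_config)
#   model.quantize(calibration_examples)   # list of tokenized calibration samples
#   model.save_quantized("llama-gptq-4bit")
#
#   # reload the quantized weights for inference
#   model = AutoGPTQForCausalLM.from_quantized("llama-gptq-4bit", device="cuda:0")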