(model_name, gptq_config: GptqConfig)
| 27 | |
| 28 | |
def load_gptq_quantized(model_name, gptq_config: GptqConfig):
    """Load a GPTQ-quantized model together with its tokenizer.

    ``load_quant`` is imported from the ``llama`` module of a local
    GPTQ-for-LLaMa checkout, whose directory is put at the front of
    ``sys.path`` first.

    Args:
        model_name: Passed to ``AutoTokenizer.from_pretrained`` and as the
            first argument of ``load_quant``.
        gptq_config: Supplies ``wbits``, ``groupsize`` and ``act_order``, and
            is handed to ``find_gptq_ckpt`` to obtain the checkpoint argument.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        SystemExit: Via ``sys.exit(-1)`` when ``llama.load_quant`` cannot be
            imported; an error message and a documentation link are printed
            first.
    """
    print("Loading GPTQ quantized model...")

    # The checkout is looked up at
    # <parent of this file's directory>/../repositories/GPTQ-for-LLaMa.
    script_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    module_path = os.path.join(script_path, "../repositories/GPTQ-for-LLaMa")

    # Register the checkout only once: inserting unconditionally would add a
    # duplicate sys.path entry on every call of this loader.
    if module_path not in sys.path:
        sys.path.insert(0, module_path)

    # Keep the try body down to the single statement that can raise
    # ImportError.
    try:
        from llama import load_quant
    except ImportError as e:
        print(f"Error: Failed to load GPTQ-for-LLaMa. {e}")
        print("See https://github.com/lm-sys/FastChat/blob/main/docs/gptq.md")
        sys.exit(-1)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    # only `fastest-inference-4bit` branch cares about `act_order`; for other
    # branches the keyword is omitted entirely so their `load_quant`
    # signature is not handed an argument it may not accept.
    quant_kwargs = {}
    if gptq_config.act_order:
        quant_kwargs["act_order"] = gptq_config.act_order

    model = load_quant(
        model_name,
        find_gptq_ckpt(gptq_config),
        gptq_config.wbits,
        gptq_config.groupsize,
        **quant_kwargs,
    )

    return model, tokenizer
| 63 | |
| 64 | |
| 65 | def find_gptq_ckpt(gptq_config: GptqConfig): |
no test coverage detected
searching dependent graphs…