MCPcopy
hub / github.com/lm-sys/FastChat / load_gptq_quantized

Function load_gptq_quantized

fastchat/modules/gptq.py:29–62  ·  view source on GitHub ↗
(model_name, gptq_config: GptqConfig)

Source from the content-addressed store, hash-verified

27
28
def load_gptq_quantized(model_name, gptq_config: GptqConfig):
    """Load a GPTQ-quantized model together with its tokenizer.

    The quantized-model loader (``llama.load_quant``) is imported from a
    GPTQ-for-LLaMa checkout located at ``../repositories/GPTQ-for-LLaMa``
    relative to the parent of this file's directory. That directory is put
    at the front of ``sys.path`` so the import can resolve.

    Args:
        model_name: Passed to both ``AutoTokenizer.from_pretrained`` and
            ``load_quant``.
        gptq_config: Quantization settings. ``wbits``, ``groupsize`` and
            ``act_order`` are read here; the whole object is handed to
            ``find_gptq_ckpt`` to obtain the checkpoint argument.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        SystemExit: With status ``-1`` if ``llama.load_quant`` cannot be
            imported.
    """
    print("Loading GPTQ quantized model...")

    script_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    module_path = os.path.join(script_path, "../repositories/GPTQ-for-LLaMa")
    # Register the checkout only once; inserting unconditionally would add a
    # duplicate entry to sys.path on every call.
    if module_path not in sys.path:
        sys.path.insert(0, module_path)

    # Keep the try body to the single statement that can raise ImportError.
    try:
        from llama import load_quant
    except ImportError as e:
        print(f"Error: Failed to load GPTQ-for-LLaMa. {e}")
        print("See https://github.com/lm-sys/FastChat/blob/main/docs/gptq.md")
        sys.exit(-1)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    # only `fastest-inference-4bit` branch cares about `act_order`; for other
    # branches the keyword is omitted entirely so their `load_quant` signature
    # is not handed an argument it may not accept.
    optional_kwargs = {}
    if gptq_config.act_order:
        optional_kwargs["act_order"] = gptq_config.act_order

    model = load_quant(
        model_name,
        find_gptq_ckpt(gptq_config),
        gptq_config.wbits,
        gptq_config.groupsize,
        **optional_kwargs,
    )

    return model, tokenizer
63
64
65def find_gptq_ckpt(gptq_config: GptqConfig):

Callers 1

load_model · Function · 0.90

Calls 1

find_gptq_ckpt · Function · 0.85

Tested by

no test coverage detected

Used in the wild: real call sites across dependent graphs

searching dependent graphs…