Function load_gptq_quantized

fastchat/modules/gptq.py:29–62 · view source on GitHub ↗

(model_name, gptq_config: GptqConfig)

Source from the content-addressed store, hash-verified

27
28
def load_gptq_quantized(model_name, gptq_config: GptqConfig):
    """Load a GPTQ-quantized model together with its tokenizer.

    The ``load_quant`` entry point is imported at call time from a
    ``repositories/GPTQ-for-LLaMa`` directory resolved relative to this
    file; that directory is pushed to the front of ``sys.path`` first.

    Args:
        model_name: Forwarded both to ``AutoTokenizer.from_pretrained`` and,
            as the first positional argument, to ``load_quant``.
        gptq_config: Supplies ``wbits``, ``groupsize`` and ``act_order`` and
            is handed to ``find_gptq_ckpt`` to obtain the checkpoint argument.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Exits:
        Calls ``sys.exit(-1)`` after printing a hint when the ``llama``
        module cannot be imported.
    """
    print("Loading GPTQ quantized model...")

    # Two levels up from this file, then into the sibling checkout directory.
    this_file = os.path.realpath(__file__)
    script_path = os.path.dirname(os.path.dirname(this_file))
    module_path = os.path.join(script_path, "../repositories/GPTQ-for-LLaMa")
    sys.path.insert(0, module_path)

    try:
        from llama import load_quant
    except ImportError as e:
        print(f"Error: Failed to load GPTQ-for-LLaMa. {e}")
        print("See https://github.com/lm-sys/FastChat/blob/main/docs/gptq.md")
        sys.exit(-1)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

    # only `fastest-inference-4bit` branch cares about `act_order`, so the
    # keyword is passed only when it is set; other branches never see it.
    optional_kwargs = {}
    if gptq_config.act_order:
        optional_kwargs["act_order"] = gptq_config.act_order

    model = load_quant(
        model_name,
        find_gptq_ckpt(gptq_config),
        gptq_config.wbits,
        gptq_config.groupsize,
        **optional_kwargs,
    )
    return model, tokenizer
63
64
65	def find_gptq_ckpt(gptq_config: GptqConfig):

load_modelFunction · 0.90

find_gptq_ckptFunction · 0.85

no test coverage detected

searching dependent graphs…