hub / github.com/zai-org/CodeGeeX2 / get_model

Function get_model

demo/run_demo.py:35–78 · view source on GitHub ↗

(args)

Source from the content-addressed store, hash-verified

33
34
def get_model(args):
    """Load the tokenizer and the model backend selected by ``args``.

    The compute device is ``"cpu"`` when ``args.cpu`` is set; otherwise it is
    ``"cuda:<args.gpu>"`` if CUDA is available, ``"mps"`` if an MPS device is
    available, and ``"cpu"`` as the last resort.

    The model is then loaded through exactly one of these paths:

    1. ``load_model_on_gpus`` when ``args.n_gpus > 1`` and the module-level
       flag ``enable_multiple_gpus`` is truthy.
    2. ``chatglm_cpp.Pipeline`` when ``enable_chatglm_cpp`` and
       ``args.chatglm_cpp`` are both truthy.
    3. ``AutoModel.from_pretrained`` otherwise, optionally converted with
       fastllm (``enable_fastllm`` and ``args.fastllm``) or quantized and
       moved to the selected device.

    Args:
        args: Parsed options. Attributes read here: ``cpu``, ``gpu``,
            ``model_path``, ``n_gpus``, ``chatglm_cpp``, ``fastllm`` and
            ``quantize``.

    Returns:
        A ``(tokenizer, model)`` tuple. The concrete model type depends on
        which of the three loading paths was taken.
    """
    # Start from CPU and upgrade only when an accelerator is both requested
    # and actually usable.
    device = "cpu"
    if not args.cpu:
        if torch.cuda.is_available():
            device = f"cuda:{args.gpu}"
        elif torch.backends.mps.is_available():
            # is_available() rather than is_built(): a PyTorch binary can be
            # compiled with MPS support and still run on a machine that has
            # no usable MPS device, in which case moving the model to "mps"
            # would fail later on.
            device = "mps"

    # NOTE(review): trust_remote_code=True allows Python code shipped with the
    # checkpoint to be executed; model_path should point at a trusted source.
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)

    if args.n_gpus > 1 and enable_multiple_gpus:
        # To load the model across multiple GPUs, pass the desired number of
        # graphics cards as "n_gpus".
        print(f"Runing on {args.n_gpus} GPUs.")
        model = load_model_on_gpus(args.model_path, num_gpus=args.n_gpus)
        return tokenizer, model.eval()

    if enable_chatglm_cpp and args.chatglm_cpp:
        print("Using chatglm-cpp to improve performance")
        # chatglm-cpp dtype names: "f16" by default, "q<N>_0" when a
        # supported quantization width is requested.
        dtype = f"q{args.quantize}_0" if args.quantize in (4, 5, 8) else "f16"
        return tokenizer, chatglm_cpp.Pipeline(args.model_path, dtype=dtype)

    model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
    model = model.eval()

    if enable_fastllm and args.fastllm:
        print("fastllm enabled.")
        model = model.half()
        llm.set_device_map(device)
        dtype = f"int{args.quantize}" if args.quantize in (4, 8) else "float16"
        model = llm.from_hf(model, dtype=dtype)
    else:
        # NOTE(review): this message is printed whenever this branch runs,
        # which includes the case where args.fastllm / args.chatglm_cpp were
        # simply not requested.
        print("chatglm-cpp and fastllm not installed, using transformers.")
        if args.quantize in (4, 8):
            print(f"Model is quantized to INT{args.quantize} format.")
            model = model.half().quantize(args.quantize)
        model = model.to(device)

    return tokenizer, model
79
80
81	def add_code_generation_args(parser):

Callers 1

main (Function) · 0.85

Calls 1

load_model_on_gpus (Function) · 0.90

Tested by

no test coverage detected