MCPcopy Index your code
hub / github.com/zai-org/CodeGeeX2 / get_model

Function get_model

demo/run_demo.py:35–78  ·  view source on GitHub ↗
(args)

Source from the content-addressed store, hash-verified

33
34
def get_model(args):
    """Load the tokenizer and the model backend selected by ``args``.

    The backend is chosen in this order:

    1. ``load_model_on_gpus`` when ``args.n_gpus > 1`` and the module-level
       ``enable_multiple_gpus`` flag is set.
    2. ``chatglm_cpp.Pipeline`` when ``enable_chatglm_cpp`` is set and
       ``args.chatglm_cpp`` is requested.
    3. ``AutoModel.from_pretrained``; the loaded model is then either
       converted with fastllm (``enable_fastllm`` and ``args.fastllm``) or
       optionally quantized and moved to the selected device.

    Args:
        args: Parsed options. This function reads ``cpu``, ``gpu``,
            ``model_path``, ``n_gpus``, ``chatglm_cpp``, ``fastllm`` and
            ``quantize``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Pick the target device: CUDA first, then Apple MPS, otherwise CPU.
    device = "cpu"
    if not args.cpu:
        # Older PyTorch builds have no ``torch.backends.mps`` attribute at all.
        mps_backend = getattr(torch.backends, "mps", None)
        if torch.cuda.is_available():
            device = f"cuda:{args.gpu}"
        elif mps_backend is not None and mps_backend.is_available():
            # is_available() is used instead of is_built(): is_built() only
            # says the binary was compiled with MPS support, not that an MPS
            # device can actually be used on this machine.
            device = "mps"

    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)

    if args.n_gpus > 1 and enable_multiple_gpus:
        # To enable multi-GPU model loading, set "n_gpus" to the desired
        # number of graphics cards.
        print(f"Runing on {args.n_gpus} GPUs.")
        model = load_model_on_gpus(args.model_path, num_gpus=args.n_gpus)
        model = model.eval()
    elif enable_chatglm_cpp and args.chatglm_cpp:
        print("Using chatglm-cpp to improve performance")
        # Default to f16; quantize values 4, 5 and 8 map to "q4_0", "q5_0", "q8_0".
        dtype = "f16"
        if args.quantize in [4, 5, 8]:
            dtype = f"q{args.quantize}_0"
        model = chatglm_cpp.Pipeline(args.model_path, dtype=dtype)
    else:
        model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
        model = model.eval()

        # Only the transformers model goes through fastllm conversion or
        # quantize/.to(device); the other backends are returned as loaded.
        if enable_fastllm and args.fastllm:
            print("fastllm enabled.")
            model = model.half()
            llm.set_device_map(device)
            if args.quantize in [4, 8]:
                model = llm.from_hf(model, dtype=f"int{args.quantize}")
            else:
                model = llm.from_hf(model, dtype="float16")
        else:
            print("chatglm-cpp and fastllm not installed, using transformers.")
            if args.quantize in [4, 8]:
                print(f"Model is quantized to INT{args.quantize} format.")
                model = model.half().quantize(args.quantize)
            model = model.to(device)

    return tokenizer, model
79
80
81def add_code_generation_args(parser):

Callers 1

mainFunction · 0.85

Calls 1

load_model_on_gpusFunction · 0.90

Tested by

no test coverage detected