(args)
| 33 | |
| 34 | |
def _select_device(args):
    """Return the torch device string the model should be placed on.

    Preference order when ``args.cpu`` is false: the CUDA device indexed by
    ``args.gpu``, then Apple MPS, then CPU. ``args.cpu`` forces ``"cpu"``.
    """
    if args.cpu:
        return "cpu"
    if torch.cuda.is_available():
        return f"cuda:{args.gpu}"
    # `is_built()` only says PyTorch was compiled with MPS support;
    # `is_available()` also checks that this machine can actually use it.
    # The getattr guard keeps older PyTorch builds (no `mps` backend) working.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"


def get_model(args):
    """Load the tokenizer and the model selected by the parsed arguments.

    Reads ``args.cpu``, ``args.gpu``, ``args.model_path``, ``args.n_gpus``,
    ``args.chatglm_cpp``, ``args.fastllm`` and ``args.quantize``.

    Backend selection, first match wins:

    1. ``args.n_gpus > 1`` and ``enable_multiple_gpus``: the model is loaded
       through ``load_model_on_gpus``.
    2. ``enable_chatglm_cpp`` and ``args.chatglm_cpp``: a
       ``chatglm_cpp.Pipeline`` is built (dtype ``f16`` or ``q{4,5,8}_0``).
    3. Otherwise the model is loaded with ``AutoModel.from_pretrained`` and
       then either converted with fastllm (``enable_fastllm`` and
       ``args.fastllm``) or kept as a transformers model, optionally
       quantized to INT4/INT8, and moved to the selected device.

    The ``enable_*`` flags are module-level names defined outside this
    function (presumably set by optional-import checks -- confirm there).

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    device = _select_device(args)

    # NOTE(security): trust_remote_code=True lets transformers run Python code
    # shipped with the model repository; only point model_path at trusted sources.
    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)

    if args.n_gpus > 1 and enable_multiple_gpus:
        # To enable multi-GPU model loading, set "n_gpus" to the desired
        # number of graphics cards.
        print(f"Runing on {args.n_gpus} GPUs.")
        model = load_model_on_gpus(args.model_path, num_gpus=args.n_gpus)
        model = model.eval()
    elif enable_chatglm_cpp and args.chatglm_cpp:
        print("Using chatglm-cpp to improve performance")
        dtype = "f16"
        if args.quantize in [4, 5, 8]:
            dtype = f"q{args.quantize}_0"
        model = chatglm_cpp.Pipeline(args.model_path, dtype=dtype)
    else:
        model = AutoModel.from_pretrained(args.model_path, trust_remote_code=True)
        model = model.eval()

        if enable_fastllm and args.fastllm:
            print("fastllm enabled.")
            model = model.half()
            llm.set_device_map(device)
            if args.quantize in [4, 8]:
                model = llm.from_hf(model, dtype=f"int{args.quantize}")
            else:
                model = llm.from_hf(model, dtype="float16")
        else:
            # NOTE(review): this message is also printed when the optional
            # backends are installed but were simply not requested.
            print("chatglm-cpp and fastllm not installed, using transformers.")
            if args.quantize in [4, 8]:
                print(f"Model is quantized to INT{args.quantize} format.")
                model = model.half().quantize(args.quantize)
            # Runs for both the quantized and the unquantized model.
            model = model.to(device)

    return tokenizer, model
| 79 | |
| 80 | |
| 81 | def add_code_generation_args(parser): |
no test coverage detected