Build the model and initialize the executor from a Hugging Face model config, using either the real checkpoint weights (when load_weights is set) or fake weights; useful for benchmarking.
(args,
model_tag,
hf_model_dir,
dtype,
load_weights,
tp,
pp,
rank=0)
| 177 | |
| 178 | |
| 179 | def build_from_hf(args, |
| 180 | model_tag, |
| 181 | hf_model_dir, |
| 182 | dtype, |
| 183 | load_weights, |
| 184 | tp, |
| 185 | pp, |
| 186 | rank=0): |
| 187 | '''Build model and init executor using huggingface model config and fake weights, useful for benchmarking |
| 188 | ''' |
| 189 | status, = cudart.cudaSetDevice(rank) |
| 190 | assert status == cudart.cudaError_t.cudaSuccess, f"cuda set device to {rank} errored: {status}" |
| 191 | log_level(args) |
| 192 | mpi_barrier() |
| 193 | world_size = tp * pp |
| 194 | # TODO: Only build 1 rank for now, all the ranks shall have similar build time |
| 195 | # shall we build all ranks in parallel? |
| 196 | mapping = Mapping(world_size=world_size, rank=rank, tp_size=tp, pp_size=pp) |
| 197 | |
| 198 | quant_config = None |
| 199 | if args.quant == 'fp8': |
| 200 | quant_config = QuantConfig(QuantAlgo.FP8) |
| 201 | |
| 202 | phase_and_time = [] |
| 203 | if load_weights: |
| 204 | quant_output_dir = tempfile.TemporaryDirectory(model_tag) |
| 205 | start = time.time() |
| 206 | if args.quant is None: |
| 207 | trtllm_model = AutoModelForCausalLM.from_hugging_face( |
| 208 | hf_model_dir, |
| 209 | dtype, |
| 210 | mapping, |
| 211 | load_model_on_cpu=args.load_to_cpu) |
| 212 | else: |
| 213 | model_cls = AutoModelForCausalLM.get_trtllm_model_class( |
| 214 | hf_model_dir) |
| 215 | if rank == 0: |
| 216 | model_cls.quantize(hf_model_dir, |
| 217 | output_dir=quant_output_dir.name, |
| 218 | dtype=args.dtype, |
| 219 | mapping=mapping, |
| 220 | quant_config=quant_config) |
| 221 | mpi_barrier( |
| 222 | ) # every rank must wait rank 0 to get the correct quantized checkpoint |
| 223 | trtllm_model = model_cls.from_checkpoint(quant_output_dir.name) |
| 224 | phase_and_time.append(('load_and_convert', time.time() - start)) |
| 225 | quant_output_dir.cleanup() |
| 226 | |
| 227 | else: # fake weights |
| 228 | trtllm_config = AutoConfig.from_hugging_face(hf_model_dir, |
| 229 | dtype, |
| 230 | mapping, |
| 231 | quant_config, |
| 232 | trust_remote_code=True) |
| 233 | trtllm_model = AutoModelForCausalLM.get_trtllm_model_class( |
| 234 | hf_model_dir)(trtllm_config) |
| 235 | |
| 236 | start = time.time() |
no test coverage detected