(self,
config,
debug_mode=True,
stream: torch.cuda.Stream = None)
| 30 | class TllmDiT(object): |
| 31 | |
| 32 | def __init__(self, |
| 33 | config, |
| 34 | debug_mode=True, |
| 35 | stream: torch.cuda.Stream = None): |
| 36 | self.dtype = config['pretrained_config']['dtype'] |
| 37 | |
| 38 | rank = tensorrt_llm.mpi_rank() |
| 39 | world_size = config['pretrained_config']['mapping']['world_size'] |
| 40 | cp_size = config['pretrained_config']['mapping']['cp_size'] |
| 41 | tp_size = config['pretrained_config']['mapping']['tp_size'] |
| 42 | pp_size = config['pretrained_config']['mapping']['pp_size'] |
| 43 | assert pp_size == 1 |
| 44 | self.mapping = tensorrt_llm.Mapping(world_size=world_size, |
| 45 | rank=rank, |
| 46 | cp_size=cp_size, |
| 47 | tp_size=tp_size, |
| 48 | pp_size=1, |
| 49 | gpus_per_node=args.gpus_per_node) |
| 50 | |
| 51 | local_rank = rank % self.mapping.gpus_per_node |
| 52 | self.device = torch.device(f'cuda:{local_rank}') |
| 53 | torch.cuda.set_device(self.device) |
| 54 | CUASSERT(cudart.cudaSetDevice(local_rank)) |
| 55 | |
| 56 | self.stream = stream |
| 57 | if self.stream is None: |
| 58 | self.stream = torch.cuda.Stream(self.device) |
| 59 | torch.cuda.set_stream(self.stream) |
| 60 | |
| 61 | engine_file = os.path.join(args.tllm_model_dir, f"rank{rank}.engine") |
| 62 | logger.info(f'Loading engine from {engine_file}') |
| 63 | with open(engine_file, "rb") as f: |
| 64 | engine_buffer = f.read() |
| 65 | |
| 66 | assert engine_buffer is not None |
| 67 | |
| 68 | self.session = Session.from_serialized_engine(engine_buffer) |
| 69 | |
| 70 | self.debug_mode = debug_mode |
| 71 | |
| 72 | self.inputs = {} |
| 73 | self.outputs = {} |
| 74 | self.buffer_allocated = False |
| 75 | |
| 76 | expected_tensor_names = ['latent', 'timestep', 'label', 'output'] |
| 77 | |
| 78 | if self.mapping.tp_size > 1: |
| 79 | self.buffer, self.all_reduce_workspace = CustomAllReduceHelper.allocate_workspace( |
| 80 | self.mapping, |
| 81 | CustomAllReduceHelper.max_workspace_size_auto( |
| 82 | self.mapping.tp_size)) |
| 83 | self.inputs['all_reduce_workspace'] = self.all_reduce_workspace |
| 84 | expected_tensor_names += ['all_reduce_workspace'] |
| 85 | |
| 86 | found_tensor_names = [ |
| 87 | self.session.engine.get_tensor_name(i) |
| 88 | for i in range(self.session.engine.num_io_tensors) |
| 89 | ] |
nothing calls this directly
no test coverage detected