| 209 | |
| 210 | |
| 211 | class _Runtime(object): |
| 212 | runtime_rank: int |
| 213 | runtime: trt.Runtime |
| 214 | engine: trt.ICudaEngine |
| 215 | ctx_context: trt.IExecutionContext |
| 216 | context_0: trt.IExecutionContext |
| 217 | context_1: trt.IExecutionContext |
| 218 | profiler: _Profiler |
| 219 | engine_inspector: trt.EngineInspector |
| 220 | cuda_graph_instances: List[cudart.cudaGraphExec_t] |
| 221 | input_tensor_names: Set[str] |
| 222 | output_tensor_names: Set[str] |
| 223 | |
def __init__(self, engine_buffer, mapping: Mapping):
    """Initialise bookkeeping fields, then build the runtime state.

    Args:
        engine_buffer: Serialized engine handed on to ``__prepare``.
        mapping: Rank/topology description handed on to ``__prepare``.
    """
    # Both fields start in their "nothing allocated yet" state before
    # ``__prepare`` runs.
    self.device_memory_size = 0
    self.address = None
    self.__prepare(mapping, engine_buffer)
| 228 | |
def _serialize_engine(self) -> trt.IHostMemory:
    """Return the result of serializing ``self.engine``."""
    serialized_engine = self.engine.serialize()
    return serialized_engine
| 231 | |
def __create_and_setup_context(self, address, size, profile_idx,
                               stream) -> trt.IExecutionContext:
    """Create an execution context backed by caller-supplied device memory.

    Args:
        address: Device memory address passed to ``set_device_memory``.
        size: Size passed alongside ``address`` to ``set_device_memory``.
        profile_idx: Optimization profile index to select on the context.
        stream: Stream passed to ``set_optimization_profile_async``.

    Returns:
        The configured execution context.
    """
    exec_ctx = self.engine.create_execution_context_without_device_memory()
    assert exec_ctx is not None, "Failed to create an execution context with the provided device memory!"
    exec_ctx.set_device_memory(address, size)
    exec_ctx.set_optimization_profile_async(profile_idx, stream)
    # DETAILED nvtx verbosity is lowered to LAYER_NAMES_ONLY for inference
    # performance; any other verbosity level is left as-is.
    is_detailed = exec_ctx.nvtx_verbosity == trt.ProfilingVerbosity.DETAILED
    if is_detailed:
        exec_ctx.nvtx_verbosity = trt.ProfilingVerbosity.LAYER_NAMES_ONLY
    return exec_ctx
| 242 | |
def _set_profiler(self):
    """Attach one shared ``_Profiler`` to the execution contexts.

    Does nothing when ``self.profiler`` is already set. Otherwise a new
    ``_Profiler`` is stored on ``self.profiler`` and assigned to
    ``context_0`` and ``context_1``, and additionally to ``ctx_context``
    when the engine reports exactly two optimization profiles. Each
    configured context also gets ``enqueue_emits_profile`` set to False.
    """
    if self.profiler is not None:
        return

    # Both generation contexts must exist before a profiler is created.
    assert self.context_0 is not None
    assert self.context_1 is not None

    shared_profiler = _Profiler()
    self.profiler = shared_profiler
    for exec_ctx in (self.context_0, self.context_1):
        exec_ctx.profiler = shared_profiler
        exec_ctx.enqueue_emits_profile = False

    # ctx_context is only configured when there are exactly two profiles.
    if self.engine.num_optimization_profiles == 2:
        assert self.ctx_context is not None
        self.ctx_context.profiler = shared_profiler
        self.ctx_context.enqueue_emits_profile = False
| 257 | |
| 258 | def __prepare(self, mapping: Mapping, engine_buffer): |
| 259 | self.runtime_rank = mapping.rank |
| 260 | local_rank = self.runtime_rank % mapping.gpus_per_node |
| 261 | if DISABLE_TORCH_DEVICE_SET: |
| 262 | CUASSERT(cudart.cudaSetDevice(torch.cuda.current_device())) |
| 263 | else: |
| 264 | torch.cuda.set_device(local_rank) |
| 265 | CUASSERT(cudart.cudaSetDevice(local_rank)) |
| 266 | |
| 267 | self.runtime = trt.Runtime(logger.trt_logger) |
| 268 | self.engine = self.runtime.deserialize_cuda_engine(engine_buffer) |
no outgoing calls