MCPcopy Index your code
hub / github.com/NVIDIA/TensorRT-LLM / _Runtime

Class _Runtime

tensorrt_llm/runtime/generation.py:211–606  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

209
210
211class _Runtime(object):
212 runtime_rank: int
213 runtime: trt.Runtime
214 engine: trt.ICudaEngine
215 ctx_context: trt.IExecutionContext
216 context_0: trt.IExecutionContext
217 context_1: trt.IExecutionContext
218 profiler: _Profiler
219 engine_inspector: trt.EngineInspector
220 cuda_graph_instances: List[cudart.cudaGraphExec_t]
221 input_tensor_names: Set[str]
222 output_tensor_names: Set[str]
223
def __init__(self, engine_buffer, mapping: Mapping):
    """Reset the device-memory bookkeeping fields, then run the preparation step.

    Args:
        engine_buffer: Forwarded unchanged to ``__prepare``.
        mapping: Forwarded unchanged to ``__prepare``.
    """
    # Both fields start out "empty"; they are plain attribute resets and are
    # in place before the preparation step runs.
    self.device_memory_size = 0
    self.address = None
    self.__prepare(mapping, engine_buffer)
228
def _serialize_engine(self) -> trt.IHostMemory:
    """Return whatever ``self.engine.serialize()`` produces."""
    serialized_engine = self.engine.serialize()
    return serialized_engine
231
def __create_and_setup_context(self, address, size, profile_idx,
                               stream) -> trt.IExecutionContext:
    """Create an execution context on ``self.engine`` and configure it.

    The context is created without its own device memory; the caller-supplied
    memory is attached right afterwards, followed by the optimization profile.

    Args:
        address: Passed to ``set_device_memory`` as the memory address.
        size: Passed to ``set_device_memory`` as the memory size.
        profile_idx: Optimization profile index to select on the context.
        stream: Stream handed to ``set_optimization_profile_async``.

    Returns:
        The configured execution context.
    """
    exec_ctx = self.engine.create_execution_context_without_device_memory()
    assert exec_ctx is not None, "Failed to create an execution context with the provided device memory!"
    exec_ctx.set_device_memory(address, size)
    exec_ctx.set_optimization_profile_async(profile_idx, stream)

    # DETAILED NVTX verbosity is downgraded to LAYER_NAMES_ONLY for inference
    # performance; any other verbosity level is left untouched.
    verbosity_levels = trt.ProfilingVerbosity
    if exec_ctx.nvtx_verbosity == verbosity_levels.DETAILED:
        exec_ctx.nvtx_verbosity = verbosity_levels.LAYER_NAMES_ONLY
    return exec_ctx
242
def _set_profiler(self):
    """Create one ``_Profiler`` and attach it to the execution contexts.

    Does nothing when ``self.profiler`` is already set. Otherwise a new
    profiler is stored on ``self`` and assigned to ``context_0`` and
    ``context_1``; when the engine reports exactly two optimization profiles
    it is assigned to ``ctx_context`` as well. ``enqueue_emits_profile`` is
    switched off on every context that receives the profiler.
    """
    if self.profiler is not None:
        return

    assert self.context_0 is not None
    assert self.context_1 is not None

    shared_profiler = _Profiler()
    self.profiler = shared_profiler

    # The two contexts that always exist are wired up first, in order.
    for exec_ctx in (self.context_0, self.context_1):
        exec_ctx.profiler = shared_profiler
        exec_ctx.enqueue_emits_profile = False

    # ctx_context is only required (and only touched) for a two-profile engine.
    if self.engine.num_optimization_profiles == 2:
        assert self.ctx_context is not None
        extra_ctx = self.ctx_context
        extra_ctx.profiler = shared_profiler
        extra_ctx.enqueue_emits_profile = False
257
258 def __prepare(self, mapping: Mapping, engine_buffer):
259 self.runtime_rank = mapping.rank
260 local_rank = self.runtime_rank % mapping.gpus_per_node
261 if DISABLE_TORCH_DEVICE_SET:
262 CUASSERT(cudart.cudaSetDevice(torch.cuda.current_device()))
263 else:
264 torch.cuda.set_device(local_rank)
265 CUASSERT(cudart.cudaSetDevice(local_rank))
266
267 self.runtime = trt.Runtime(logger.trt_logger)
268 self.engine = self.runtime.deserialize_cuda_engine(engine_buffer)

Callers 3

_from_hf_modelMethod · 0.90
__init__Method · 0.85

Calls

no outgoing calls

Tested by 2

_from_hf_modelMethod · 0.72