hub / github.com/NVIDIA/TensorRT-LLM / _Runtime

Class _Runtime

tensorrt_llm/runtime/generation.py:211–606 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

209
210
211	class _Runtime(object):
212	runtime_rank: int
213	runtime: trt.Runtime
214	engine: trt.ICudaEngine
215	ctx_context: trt.IExecutionContext
216	context_0: trt.IExecutionContext
217	context_1: trt.IExecutionContext
218	profiler: _Profiler
219	engine_inspector: trt.EngineInspector
220	cuda_graph_instances: List[cudart.cudaGraphExec_t]
221	input_tensor_names: Set[str]
222	output_tensor_names: Set[str]
223
def __init__(self, engine_buffer, mapping: Mapping):
    """Reset the bookkeeping fields and delegate setup to ``__prepare``.

    Args:
        engine_buffer: Forwarded unchanged to ``__prepare``.
        mapping: Forwarded unchanged to ``__prepare``.
    """
    # Both fields start out empty; nothing in this method fills them in.
    self.address, self.device_memory_size = None, 0
    self.__prepare(mapping, engine_buffer)
228
def _serialize_engine(self) -> trt.IHostMemory:
    """Return whatever ``self.engine.serialize()`` produces."""
    serialized = self.engine.serialize()
    return serialized
231
def __create_and_setup_context(self, address, size, profile_idx,
                               stream) -> trt.IExecutionContext:
    """Create an execution context on ``self.engine`` and configure it.

    The context is created without device memory of its own; ``address``
    and ``size`` are then passed to ``set_device_memory``, and
    ``profile_idx`` / ``stream`` to ``set_optimization_profile_async``.

    Args:
        address: First argument given to ``set_device_memory``.
        size: Second argument given to ``set_device_memory``.
        profile_idx: Optimization profile index to select.
        stream: Stream argument for ``set_optimization_profile_async``.

    Returns:
        The configured execution context.

    Raises:
        AssertionError: If the engine returns ``None`` instead of a context.
    """
    exec_ctx = self.engine.create_execution_context_without_device_memory()
    assert exec_ctx is not None, "Failed to create an execution context with the provided device memory!"

    exec_ctx.set_device_memory(address, size)
    exec_ctx.set_optimization_profile_async(profile_idx, stream)

    # DETAILED NVTX verbosity is lowered to LAYER_NAMES_ONLY for inference
    # performance; any other verbosity level is left as it is.
    is_detailed = exec_ctx.nvtx_verbosity == trt.ProfilingVerbosity.DETAILED
    if is_detailed:
        exec_ctx.nvtx_verbosity = trt.ProfilingVerbosity.LAYER_NAMES_ONLY
    return exec_ctx
242
def _set_profiler(self):
    """Attach a single shared ``_Profiler`` to the execution contexts.

    Returns immediately when ``self.profiler`` is already set. Otherwise a
    new ``_Profiler`` is stored on ``self.profiler`` and assigned to
    ``context_0`` and ``context_1``, each with ``enqueue_emits_profile``
    turned off. When the engine reports exactly two optimization profiles,
    ``ctx_context`` receives the same treatment.

    Raises:
        AssertionError: If a context that is about to be configured is
            ``None``.
    """
    if self.profiler is not None:
        return

    assert self.context_0 is not None
    assert self.context_1 is not None

    shared_profiler = _Profiler()
    self.profiler = shared_profiler

    # context_0 and context_1 are configured before ctx_context is even
    # looked at, so a failing ctx_context assertion leaves them updated.
    for exec_ctx in (self.context_0, self.context_1):
        exec_ctx.profiler = shared_profiler
        exec_ctx.enqueue_emits_profile = False

    if self.engine.num_optimization_profiles == 2:
        assert self.ctx_context is not None
        self.ctx_context.profiler = shared_profiler
        self.ctx_context.enqueue_emits_profile = False
257
258	def __prepare(self, mapping: Mapping, engine_buffer):
259	self.runtime_rank = mapping.rank
260	local_rank = self.runtime_rank % mapping.gpus_per_node
261	if DISABLE_TORCH_DEVICE_SET:
262	CUASSERT(cudart.cudaSetDevice(torch.cuda.current_device()))
263	else:
264	torch.cuda.set_device(local_rank)
265	CUASSERT(cudart.cudaSetDevice(local_rank))
266
267	self.runtime = trt.Runtime(logger.trt_logger)
268	self.engine = self.runtime.deserialize_cuda_engine(engine_buffer)

Callers 3

_from_hf_model · Method · 0.90

_from_fp8_quantized_engine · Method · 0.90

__init__ · Method · 0.85

Calls

no outgoing calls

Tested by 2

_from_hf_model · Method · 0.72

_from_fp8_quantized_engine · Method · 0.72