hub / github.com/NVIDIA/TensorRT-LLM / __init__

Method `__init__`

tensorrt_llm/runtime/generation.py:880–1156 · view source on GitHub ↗

(self,
                 model_config: ModelConfig,
                 engine_buffer,
                 mapping: Mapping,
                 debug_mode=False,
                 debug_tensors_to_save=None,
                 cuda_graph_mode=False,
                 stream: torch.cuda.Stream = None)

Source from the content-addressed store, hash-verified

878	medusa_temperature: float = 0.0
879
880	def __init__(self,
881	model_config: ModelConfig,
882	engine_buffer,
883	mapping: Mapping,
884	debug_mode=False,
885	debug_tensors_to_save=None,
886	cuda_graph_mode=False,
887	stream: torch.cuda.Stream = None):
888	assert isinstance(model_config, ModelConfig)
889	self._model_config = model_config
890	self.mapping = mapping
891	self.runtime = _Runtime(engine_buffer, mapping)
892	if DISABLE_TORCH_DEVICE_SET:
893	self.device = torch.device(f'cuda:{torch.cuda.current_device()}')
894	else:
895	self.device = torch.device(
896	f'cuda:{self.runtime.runtime_rank % mapping.gpus_per_node}')
897	torch.cuda.set_device(self.device)
898	# dynamic_decoder currently use torch's current stream, so must let TRT enqueue use same stream here
899	self.stream = stream
900	if self.stream is None:
901	self.stream = torch.cuda.Stream(self.device)
902	torch.cuda.set_stream(self.stream)
903	self.debug_mode = debug_mode
904	self.debug_tensors_to_save = debug_tensors_to_save
905
906	self.cuda_graph_mode = cuda_graph_mode
907	# Optional inputs for dynamic decoder
908	self.top_p_decay = None
909	self.top_p_min = None
910	self.top_p_reset_ids = None
911	# TODO: in tensorrt_llm/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp it's T, can be float or half?
912	self.embedding_bias_opt = None
913
914	self.buffer = None
915	self.buffer_allocated = False
916
917	self.vocab_size_padded = pad_vocab_size(self.vocab_size,
918	self.mapping.tp_size)
919	if len(model_config.layer_types) == 0:
920	self.layer_types = ['attention'] * model_config.num_layers
921	else:
922	layer_types = model_config.layer_types
923	layer_types = layer_types * (model_config.num_layers //
924	len(layer_types))
925	layer_types = layer_types + layer_types[0:(model_config.num_layers %
926	len(layer_types))]
927	self.layer_types = layer_types
928	self.num_attn_layers = \
929	self.layer_types[self.first_layer:self.last_layer].count('attention')
930	self.has_attn_layers = self.num_attn_layers > 0
931	self.has_rnn_layers = 'recurrent' in self.layer_types[
932	self.first_layer:self.last_layer]
933
934	self.attn_to_general_idx = {}
935	self.general_to_attn_idx = {}
936	attn_layer_idx = 0
937	for i in range(self.first_layer, self.last_layer):

Callers

nothing calls this directly

Calls 15

_tensor_dtype · Method · 0.95

_Runtime · Class · 0.85

pad_vocab_size · Function · 0.85

get_redrafter_tensor_names · Function · 0.85

allocate_workspace · Method · 0.80

max_workspace_size_auto · Method · 0.80

get_missing_qkv_modules · Method · 0.80

device · Method · 0.45

warning · Method · 0.45

has_pp · Method · 0.45

is_last_pp_rank · Method · 0.45

is_first_pp_rank · Method · 0.45

Tested by

no test coverage detected

Method __init__

Source from the content-addressed store, hash-verified

Callers

Calls 15

Tested by

Method `__init__`