MCPcopy Index your code
hub / github.com/NVIDIA/TensorRT-LLM / __init__

Method __init__

tensorrt_llm/runtime/generation.py:880–1156  ·  view source on GitHub ↗
(self,
                 model_config: ModelConfig,
                 engine_buffer,
                 mapping: Mapping,
                 debug_mode=False,
                 debug_tensors_to_save=None,
                 cuda_graph_mode=False,
                 stream: torch.cuda.Stream = None)

Source from the content-addressed store, hash-verified

878 medusa_temperature: float = 0.0
879
880 def __init__(self,
881 model_config: ModelConfig,
882 engine_buffer,
883 mapping: Mapping,
884 debug_mode=False,
885 debug_tensors_to_save=None,
886 cuda_graph_mode=False,
887 stream: torch.cuda.Stream = None):
888 assert isinstance(model_config, ModelConfig)
889 self._model_config = model_config
890 self.mapping = mapping
891 self.runtime = _Runtime(engine_buffer, mapping)
892 if DISABLE_TORCH_DEVICE_SET:
893 self.device = torch.device(f'cuda:{torch.cuda.current_device()}')
894 else:
895 self.device = torch.device(
896 f'cuda:{self.runtime.runtime_rank % mapping.gpus_per_node}')
897 torch.cuda.set_device(self.device)
898 # dynamic_decoder currently uses torch's current stream, so the TRT enqueue must use the same stream here
899 self.stream = stream
900 if self.stream is None:
901 self.stream = torch.cuda.Stream(self.device)
902 torch.cuda.set_stream(self.stream)
903 self.debug_mode = debug_mode
904 self.debug_tensors_to_save = debug_tensors_to_save
905
906 self.cuda_graph_mode = cuda_graph_mode
907 # Optional inputs for dynamic decoder
908 self.top_p_decay = None
909 self.top_p_min = None
910 self.top_p_reset_ids = None
911 # TODO: in tensorrt_llm/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp it's T, can be float or half?
912 self.embedding_bias_opt = None
913
914 self.buffer = None
915 self.buffer_allocated = False
916
917 self.vocab_size_padded = pad_vocab_size(self.vocab_size,
918 self.mapping.tp_size)
919 if len(model_config.layer_types) == 0:
920 self.layer_types = ['attention'] * model_config.num_layers
921 else:
922 layer_types = model_config.layer_types
923 layer_types = layer_types * (model_config.num_layers //
924 len(layer_types))
925 layer_types = layer_types + layer_types[0:(model_config.num_layers %
926 len(layer_types))]
927 self.layer_types = layer_types
928 self.num_attn_layers = \
929 self.layer_types[self.first_layer:self.last_layer].count('attention')
930 self.has_attn_layers = self.num_attn_layers > 0
931 self.has_rnn_layers = 'recurrent' in self.layer_types[
932 self.first_layer:self.last_layer]
933
934 self.attn_to_general_idx = {}
935 self.general_to_attn_idx = {}
936 attn_layer_idx = 0
937 for i in range(self.first_layer, self.last_layer):

Callers

nothing calls this directly

Calls 15

_tensor_dtype · Method · 0.95
_Runtime · Class · 0.85
pad_vocab_size · Function · 0.85
allocate_workspace · Method · 0.80
device · Method · 0.45
warning · Method · 0.45
has_pp · Method · 0.45
is_last_pp_rank · Method · 0.45
is_first_pp_rank · Method · 0.45

Tested by

no test coverage detected