(self,
model_config: ModelConfig,
engine_buffer,
mapping: Mapping,
debug_mode=False,
debug_tensors_to_save=None,
cuda_graph_mode=False,
stream: torch.cuda.Stream = None)
| 878 | medusa_temperature: float = 0.0 |
| 879 | |
| 880 | def __init__(self, |
| 881 | model_config: ModelConfig, |
| 882 | engine_buffer, |
| 883 | mapping: Mapping, |
| 884 | debug_mode=False, |
| 885 | debug_tensors_to_save=None, |
| 886 | cuda_graph_mode=False, |
| 887 | stream: torch.cuda.Stream = None): |
| 888 | assert isinstance(model_config, ModelConfig) |
| 889 | self._model_config = model_config |
| 890 | self.mapping = mapping |
| 891 | self.runtime = _Runtime(engine_buffer, mapping) |
| 892 | if DISABLE_TORCH_DEVICE_SET: |
| 893 | self.device = torch.device(f'cuda:{torch.cuda.current_device()}') |
| 894 | else: |
| 895 | self.device = torch.device( |
| 896 | f'cuda:{self.runtime.runtime_rank % mapping.gpus_per_node}') |
| 897 | torch.cuda.set_device(self.device) |
| 898 | # dynamic_decoder currently uses torch's current stream, so the TRT enqueue must use the same stream here |
| 899 | self.stream = stream |
| 900 | if self.stream is None: |
| 901 | self.stream = torch.cuda.Stream(self.device) |
| 902 | torch.cuda.set_stream(self.stream) |
| 903 | self.debug_mode = debug_mode |
| 904 | self.debug_tensors_to_save = debug_tensors_to_save |
| 905 | |
| 906 | self.cuda_graph_mode = cuda_graph_mode |
| 907 | # Optional inputs for dynamic decoder |
| 908 | self.top_p_decay = None |
| 909 | self.top_p_min = None |
| 910 | self.top_p_reset_ids = None |
| 911 | # TODO: in tensorrt_llm/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp it's T, can be float or half? |
| 912 | self.embedding_bias_opt = None |
| 913 | |
| 914 | self.buffer = None |
| 915 | self.buffer_allocated = False |
| 916 | |
| 917 | self.vocab_size_padded = pad_vocab_size(self.vocab_size, |
| 918 | self.mapping.tp_size) |
| 919 | if len(model_config.layer_types) == 0: |
| 920 | self.layer_types = ['attention'] * model_config.num_layers |
| 921 | else: |
| 922 | layer_types = model_config.layer_types |
| 923 | layer_types = layer_types * (model_config.num_layers // |
| 924 | len(layer_types)) |
| 925 | layer_types = layer_types + layer_types[0:(model_config.num_layers % |
| 926 | len(layer_types))] |
| 927 | self.layer_types = layer_types |
| 928 | self.num_attn_layers = \ |
| 929 | self.layer_types[self.first_layer:self.last_layer].count('attention') |
| 930 | self.has_attn_layers = self.num_attn_layers > 0 |
| 931 | self.has_rnn_layers = 'recurrent' in self.layer_types[ |
| 932 | self.first_layer:self.last_layer] |
| 933 | |
| 934 | self.attn_to_general_idx = {} |
| 935 | self.general_to_attn_idx = {} |
| 936 | attn_layer_idx = 0 |
| 937 | for i in range(self.first_layer, self.last_layer): |
nothing calls this directly
no test coverage detected