(self, config: PretrainedConfig)
class DecoderModel(PretrainedModel):

    def __init__(self, config: PretrainedConfig):
        """Validate ``config`` and mirror its decoder settings onto the model.

        ``check_config`` runs first, then the base-class initializer; the
        values read afterwards come from ``self.config``.

        Args:
            config: Model configuration. Attributes read unconditionally must
                exist; ``encoder_num_kv_heads``, ``mlp_type`` and
                ``use_implicit_relative_attention`` are optional and fall
                back to a default when the attribute is absent.
        """
        self.check_config(config)
        super().__init__(config)

        cfg = self.config
        self.mapping = cfg.mapping

        self.has_position_embedding = cfg.has_position_embedding
        # Token-type embeddings are enabled whenever a type vocabulary size
        # is configured.
        self.has_token_type_embedding = cfg.type_vocab_size is not None
        self.rescale_before_lm_head = cfg.rescale_before_lm_head

        # e.g. BART regular, T5 RMS
        self.layernorm_type = cfg.layernorm_type

        # e.g. BART true, T5 false
        self.has_attention_qkvo_bias = cfg.has_attention_qkvo_bias
        self.has_mlp_bias = cfg.has_mlp_bias

        # e.g. BART false, T5 true
        self.has_model_final_layernorm = cfg.has_model_final_layernorm
        self._dtype = cfg.dtype
        # no quantization considered for now
        self._kv_dtype = self._dtype
        self._logits_dtype = cfg.logits_dtype

        self.total_num_layers = cfg.num_hidden_layers
        # Layers are split evenly (floor division) across pipeline ranks.
        self.num_layers = cfg.num_hidden_layers // self.mapping.pp_size

        self.hidden_size = cfg.hidden_size
        self.num_heads = cfg.num_attention_heads
        # The KV head count always mirrors the attention head count here.
        # NOTE(review): the previous code re-checked this value for
        # None / <= 0 and then fell back to the very same ``self.num_heads``,
        # which was a no-op. If a separate KV head count from the config was
        # intended (as done for ``encoder_num_kv_heads`` below), confirm and
        # add it.
        self.num_kv_heads = self.num_heads
        # An explicit head size in the config wins over the derived one.
        self.head_size = (self.hidden_size // self.num_heads
                          if cfg.head_size is None else cfg.head_size)

        self.encoder_hidden_size = cfg.encoder_hidden_size
        self.encoder_num_heads = cfg.encoder_num_heads
        # Optional attribute: a missing, None or non-positive value means
        # "use the encoder attention head count".
        encoder_num_kv_heads = getattr(cfg, "encoder_num_kv_heads", None)
        if encoder_num_kv_heads is None or encoder_num_kv_heads <= 0:
            encoder_num_kv_heads = self.encoder_num_heads
        self.encoder_num_kv_heads = encoder_num_kv_heads
        # NOTE(review): the derived value divides by the decoder's
        # ``num_heads`` rather than ``encoder_num_heads``; kept unchanged --
        # confirm this is intended.
        self.encoder_head_size = (self.encoder_hidden_size // self.num_heads
                                  if cfg.encoder_head_size is None else
                                  cfg.encoder_head_size)

        # Clamping is switched on only for float16 T5 / Pix2Struct models.
        self.fp16_clamping = (cfg.dtype == 'float16'
                              and cfg.model_type in ['t5', 'pix2struct'])

        self.skip_cross_kv = cfg.skip_cross_kv
        # Optional attributes with defaults for configs that lack them.
        self.mlp_type = getattr(cfg, "mlp_type", MLPType.MLP)
        self.use_implicit_relative_attention = getattr(
            cfg, "use_implicit_relative_attention", False)
nothing calls this directly
no test coverage detected