Serializable quantization configuration class, part of the PretrainedConfig. Args: quant_algo (tensorrt_llm.quantization.mode.QuantAlgo, optional): Quantization algorithm. Defaults to None. kv_cache_quant_algo (tensorrt_llm.quantization.mode.QuantAlgo, optional): KV cache quantization algorithm. Defaults to None.
| 129 | |
| 130 | @dataclasses.dataclass |
| 131 | class QuantConfig: |
| 132 | """ |
| 133 | Serializable quantization configuration class, part of the PretrainedConfig. |
| 134 | |
| 135 | Args: |
| 136 | quant_algo (tensorrt_llm.quantization.mode.QuantAlgo, optional): Quantization algorithm. Defaults to None. |
| 137 | kv_cache_quant_algo (tensorrt_llm.quantization.mode.QuantAlgo, optional): KV cache quantization algorithm. Defaults to None. |
| 138 | group_size (int): The group size for group-wise quantization. Defaults to 128. |
| 139 | smoothquant_val (float): The smoothing parameter alpha used in smooth quant. Defaults to 0.5. |
| 140 | clamp_val (List[float], optional): The clamp values used in FP8 rowwise quantization. Defaults to None. |
| 141 | use_meta_recipe (bool): Whether to use Meta's recipe for FP8 rowwise quantization. Defaults to False. |
| 142 | has_zero_point (bool): Whether to use zero point for quantization. Defaults to False. |
| 143 | pre_quant_scale (bool): Whether to use pre-quant scale for quantization. Defaults to False. |
| 144 | exclude_modules (List[str], optional): The module name patterns that are skipped in quantization. Defaults to None. |
| 145 | mamba_ssm_cache_dtype (str, optional): The data type for mamba SSM cache. Defaults to None. |
| 146 | """ |
| 147 | quant_algo: Optional[QuantAlgo] = None |
| 148 | kv_cache_quant_algo: Optional[QuantAlgo] = None |
| 149 | group_size: int = 128 |
| 150 | smoothquant_val: float = 0.5 |
| 151 | clamp_val: Optional[List[float]] = None |
| 152 | use_meta_recipe: bool = False |
| 153 | has_zero_point: bool = False |
| 154 | pre_quant_scale: bool = False |
| 155 | exclude_modules: Optional[List[str]] = None |
| 156 | mamba_ssm_cache_dtype: Optional[str] = None |
| 157 | |
@cached_property
def quant_mode(self) -> QuantModeWrapper:
    """Return the quantization mode wrapped in a ``QuantModeWrapper``.

    The mode is built by ``QuantMode.from_quant_algo`` from ``quant_algo``
    and ``kv_cache_quant_algo`` and handed to the wrapper as a
    single-element list.

    NOTE(review): decorated with ``cached_property`` (presumably
    ``functools.cached_property``), so the value is expected to be computed
    on first access only; later changes to the two algorithm fields would
    then not be reflected -- confirm against the file's imports.
    """
    mode = QuantMode.from_quant_algo(self.quant_algo, self.kv_cache_quant_algo)
    return QuantModeWrapper([mode])
| 167 | |
@cached_property
def layer_quant_mode(self) -> QuantMode:
    """Return the ``QuantMode`` built from this config's two algorithm fields.

    Passes ``quant_algo`` and ``kv_cache_quant_algo`` (in that order) to
    ``QuantMode.from_quant_algo`` and returns the result unwrapped.
    """
    algo, kv_algo = self.quant_algo, self.kv_cache_quant_algo
    return QuantMode.from_quant_algo(algo, kv_algo)
| 174 | |
@property
def _use_plugin_sq(self) -> bool:
    """Tell whether ``quant_algo`` is one of the ``W8A8_SQ_PLUGIN_LIST`` entries."""
    algo = self.quant_algo
    return algo in W8A8_SQ_PLUGIN_LIST
| 178 | |
@property
def _requires_calibration(self):
    """Tell whether this configuration needs a calibration step.

    True when ``quant_algo`` is any member of ``QUANT_ALGO_LIST`` other than
    ``W8A16``, ``W4A16`` or ``FP8_PER_CHANNEL_PER_TOKEN``, or when
    ``kv_cache_quant_algo`` is a member of ``KV_CACHE_QUANT_ALGO_LIST``.
    """
    # Algorithms listed in QUANT_ALGO_LIST that are excluded from the check.
    exempt_algos = {
        QuantAlgo.W8A16,
        QuantAlgo.W4A16,
        QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN,
    }
    calibrated_algos = set(QUANT_ALGO_LIST) - exempt_algos
    if self.quant_algo in calibrated_algos:
        return True
    # Otherwise the KV cache algorithm alone decides.
    return self.kv_cache_quant_algo in KV_CACHE_QUANT_ALGO_LIST
| 185 | |
| 186 | @property |
| 187 | def _requires_modelopt_quantization(self): |
| 188 | if self.quant_algo in [ |
no outgoing calls