MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / QuantConfig

Class QuantConfig

tensorrt_llm/models/modeling_utils.py:131–271  ·  view source on GitHub ↗

Serializable quantization configuration class, part of the PretrainedConfig. Args: quant_algo (tensorrt_llm.quantization.mode.QuantAlgo, optional): Quantization algorithm. Defaults to None. kv_cache_quant_algo (tensorrt_llm.quantization.mode.QuantAlgo, optional): KV cache quantization algorithm. Defaults to None. …

Source from the content-addressed store, hash-verified

129
130@dataclasses.dataclass
131class QuantConfig:
132 """
133 Serializable quantization configuration class, part of the PretrainedConfig.
134
135 Args:
136 quant_algo (tensorrt_llm.quantization.mode.QuantAlgo, optional): Quantization algorithm. Defaults to None.
137 kv_cache_quant_algo (tensorrt_llm.quantization.mode.QuantAlgo, optional): KV cache quantization algorithm. Defaults to None.
138 group_size (int): The group size for group-wise quantization. Defaults to 128.
139 smoothquant_val (float): The smoothing parameter alpha used in smooth quant. Defaults to 0.5.
140 clamp_val (List[float], optional): The clamp values used in FP8 rowwise quantization. Defaults to None.
141 use_meta_recipe (bool): Whether to use Meta's recipe for FP8 rowwise quantization. Defaults to False.
142 has_zero_point (bool): Whether to use zero point for quantization. Defaults to False.
143 pre_quant_scale (bool): Whether to use pre-quant scale for quantization. Defaults to False.
144 exclude_modules (List[str], optional): The module name patterns that are skipped in quantization. Defaults to None.
145 mamba_ssm_cache_dtype (str, optional): The data type for mamba SSM cache. Defaults to None.
146 """
147 quant_algo: Optional[QuantAlgo] = None
148 kv_cache_quant_algo: Optional[QuantAlgo] = None
149 group_size: int = 128
150 smoothquant_val: float = 0.5
151 clamp_val: Optional[List[float]] = None
152 use_meta_recipe: bool = False
153 has_zero_point: bool = False
154 pre_quant_scale: bool = False
155 exclude_modules: Optional[List[str]] = None
156 mamba_ssm_cache_dtype: Optional[str] = None
157
@cached_property
def quant_mode(self) -> QuantModeWrapper:
    """Return the quantization mode wrapped in a ``QuantModeWrapper``.

    The wrapper is built from a one-element list holding the ``QuantMode``
    that ``QuantMode.from_quant_algo`` derives from ``quant_algo`` and
    ``kv_cache_quant_algo``. As a ``cached_property`` the value is computed
    on first access and then reused for this instance.
    """
    mode = QuantMode.from_quant_algo(self.quant_algo, self.kv_cache_quant_algo)
    return QuantModeWrapper([mode])
167
@cached_property
def layer_quant_mode(self) -> QuantMode:
    """Return the ``QuantMode`` derived from this configuration.

    The mode comes from ``QuantMode.from_quant_algo`` applied to
    ``quant_algo`` and ``kv_cache_quant_algo``. As a ``cached_property`` the
    value is computed on first access and then reused for this instance.
    """
    algo, kv_cache_algo = self.quant_algo, self.kv_cache_quant_algo
    return QuantMode.from_quant_algo(algo, kv_cache_algo)
174
@property
def _use_plugin_sq(self):
    """Whether ``quant_algo`` is a member of ``W8A8_SQ_PLUGIN_LIST``."""
    algo = self.quant_algo
    return algo in W8A8_SQ_PLUGIN_LIST
178
@property
def _requires_calibration(self):
    """Whether this configuration is flagged as requiring calibration.

    Returns True when ``quant_algo`` is any entry of ``QUANT_ALGO_LIST``
    other than ``W8A16``, ``W4A16`` or ``FP8_PER_CHANNEL_PER_TOKEN``, or when
    ``kv_cache_quant_algo`` is listed in ``KV_CACHE_QUANT_ALGO_LIST``.
    """
    # Algorithms excluded from the calibration requirement.
    exempt_algos = {
        QuantAlgo.W8A16,
        QuantAlgo.W4A16,
        QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN,
    }
    calibrated_algos = set(QUANT_ALGO_LIST) - exempt_algos
    if self.quant_algo in calibrated_algos:
        return True
    return self.kv_cache_quant_algo in KV_CACHE_QUANT_ALGO_LIST
185
186 @property
187 def _requires_modelopt_quantization(self):
188 if self.quant_algo in [

Callers 15

get_quant_configFunction · 0.90
test_fp8_rowwiseMethod · 0.90
test_smooth_quantMethod · 0.90
test_int4_awqMethod · 0.90
test_fp8Method · 0.90
test_fp8_pp2Method · 0.90
test_fp8_rowwiseMethod · 0.90
test_quant_tp4Method · 0.90
test_fp8Method · 0.90
test_fp8Method · 0.90

Calls

no outgoing calls

Tested by 15

test_fp8_rowwiseMethod · 0.72
test_smooth_quantMethod · 0.72
test_int4_awqMethod · 0.72
test_fp8Method · 0.72
test_fp8_pp2Method · 0.72
test_fp8_rowwiseMethod · 0.72
test_quant_tp4Method · 0.72
test_fp8Method · 0.72
test_fp8Method · 0.72