MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / smooth_quantize_plugin

Function smooth_quantize_plugin

tensorrt_llm/quantization/quantize.py:187–221  ·  view source on GitHub ↗
(model, quant_mode)

Source from the content-addressed store, hash-verified

185
186
def smooth_quantize_plugin(model, quant_mode):
    """Replace supported layers of ``model`` with their SmoothQuant variants.

    Walks ``model.named_modules_with_parent()`` and, for every layer that is
    an instance of a class in the replacement table, builds the matching
    SmoothQuant layer from the original layer's init parameters and installs
    it on the parent under the same attribute name.

    Args:
        model: Module tree exposing ``named_modules_with_parent()``, which
            yields ``(name, layer, parent)`` triples.
        quant_mode: Value passed to every replacement layer as its
            ``quant_mode`` init argument and stored on the returned model.

    Returns:
        The model with its layers swapped. If a matched layer has no parent,
        the newly built replacement layer itself becomes the returned model.
    """
    # Layers whose last name component is listed here are never replaced.
    skipped_names = ['ln_f', 'ln_embed']

    # Checked in insertion order; the first class the layer is an instance
    # of decides the replacement.
    # NOTE(review): the order looks deliberate (GatedMLP before MLP) --
    # confirm before reordering.
    replacements = {
        RmsNorm: SmoothQuantRmsNorm,
        LayerNorm: SmoothQuantLayerNorm,
        GatedMLP: SmoothQuantGatedMLP,
        MLP: SmoothQuantMLP,
        Attention: SmoothQuantAttention,
    }

    for name, layer, parent in model.named_modules_with_parent():
        # Only the last dotted component is the attribute name on the parent.
        leaf_name = name.rsplit('.', 1)[-1]
        if leaf_name in skipped_names:
            continue

        target_cls = next(
            (smooth_cls for base_cls, smooth_cls in replacements.items()
             if isinstance(layer, base_cls)),
            None,
        )
        if target_cls is None:
            continue

        kwargs = get_init_params(layer, target_cls)
        kwargs["quant_mode"] = quant_mode
        if isinstance(layer, Attention):
            # NOTE(review): presumably the layer stores a per-rank head count
            # and the constructor expects the total -- confirm.
            total_heads = layer.num_attention_heads * layer.tp_size
            kwargs["num_attention_heads"] = total_heads

        replacement = target_cls(**kwargs)
        if parent is None:
            # No parent to attach to: the replacement becomes the model.
            model = replacement
        else:
            setattr(parent, leaf_name, replacement)

    model.quant_mode = quant_mode
    return model
222
223
224def smooth_quantize(model, quant_config: QuantConfig):

Callers 1

smooth_quantizeFunction · 0.85

Calls 2

get_init_paramsFunction · 0.85

Tested by

no test coverage detected