Function smooth_quantize_plugin

tensorrt_llm/quantization/quantize.py:187–221 · view source on GitHub ↗

(model, quant_mode)

Source from the content-addressed store, hash-verified

185
186
def smooth_quantize_plugin(model, quant_mode):
    """Swap supported layers of ``model`` for their SmoothQuant variants.

    Walks ``model.named_modules_with_parent()`` and, for every layer that is
    an instance of one of the supported source classes, builds the matching
    SmoothQuant layer and installs it on the parent under the same attribute
    name. Layers whose leaf name is ``ln_f`` or ``ln_embed`` are left as-is.

    Args:
        model: Root module exposing ``named_modules_with_parent()``.
        quant_mode: Passed to every replacement layer's constructor as
            ``quant_mode`` and stored on the returned model.

    Returns:
        The model with replacements applied. If the root itself is replaced
        (its parent is ``None``), the replacement layer is returned instead.
    """
    # Order matters: the first matching source class wins, so GatedMLP is
    # tested before MLP.
    replacements = {
        RmsNorm: SmoothQuantRmsNorm,
        LayerNorm: SmoothQuantLayerNorm,
        GatedMLP: SmoothQuantGatedMLP,
        MLP: SmoothQuantMLP,
        Attention: SmoothQuantAttention,
    }
    skipped_leaf_names = ('ln_f', 'ln_embed')

    for qualified_name, module, owner in model.named_modules_with_parent():
        # Last dotted component is the attribute name on the parent.
        leaf_name = qualified_name.rpartition('.')[2]
        if leaf_name in skipped_leaf_names:
            continue

        target_cls = next(
            (dst for src, dst in replacements.items()
             if isinstance(module, src)),
            None,
        )
        if target_cls is None:
            continue

        ctor_kwargs = get_init_params(module, target_cls)
        ctor_kwargs["quant_mode"] = quant_mode
        if isinstance(module, Attention):
            # NOTE(review): presumably the layer stores a per-rank head count
            # while the constructor expects the total -- confirm.
            ctor_kwargs["num_attention_heads"] = (
                module.num_attention_heads * module.tp_size)

        replacement = target_cls(**ctor_kwargs)
        if parent_is_missing := owner is None:
            # The matched layer is the root, so it becomes the new model.
            model = replacement
        else:
            setattr(owner, leaf_name, replacement)

    model.quant_mode = quant_mode
    return model
222
223
224	def smooth_quantize(model, quant_config: QuantConfig):

smooth_quantizeFunction · 0.85

get_init_paramsFunction · 0.85

named_modules_with_parentMethod · 0.80

no test coverage detected