Method forward

tensorrt_llm/quantization/layers.py:215–243 · view source on GitHub ↗

(self, x, lora_runtime_params=None, all_reduce_params=None)

Source from the content-addressed store, hash-verified

213	self.quant_mode = quant_mode
214
def forward(self, x, lora_runtime_params=None, all_reduce_params=None):
    """Apply the smooth-quant GEMM, then reduce across ranks and add the bias.

    Args:
        x: Layer input. When ``self.quant_mode.has_act_static_scaling()``
            is true, ``x`` is passed to the GEMM as-is and the activation
            scale is read from ``self.act_scale``. Otherwise ``x`` is
            unpacked as a two-item ``(input, scale)`` pair.
        lora_runtime_params: Must be ``None``; any other value trips the
            assertion below.
        all_reduce_params: Forwarded to ``allreduce`` on the
            tensor-parallel path. When the layer has a bias and
            ``all_reduce_params.fusion_op`` equals
            ``AllReduceFusionOp.RESIDUAL_RMS_NORM``, its ``bias``
            attribute is overwritten in place with ``self.bias.value``.

    Returns:
        The GEMM result, passed through ``allreduce`` when
        ``self.tp_size > 1`` and ``self.tp_group`` is set, with
        ``self.bias.value`` added here unless the bias was handed to
        the all-reduce instead.
    """
    assert lora_runtime_params is None, "lora is not supported on SmoothQuantRowLinear now"

    mode = self.quant_mode
    if mode.has_act_static_scaling():
        activation_scale = self.act_scale.value
    else:
        # Without static scaling the caller supplies the scale with the input.
        x, activation_scale = x

    out = smooth_quant_gemm(
        x,
        self.weight.value,
        activation_scale,
        self.per_channel_scale.value,
        mode.has_per_token_dynamic_scaling(),
        mode.has_per_channel_scaling(),
        self.dtype,
    )

    tensor_parallel = self.tp_size > 1 and self.tp_group is not None
    has_bias = self.bias is not None

    # Single-rank path: no reduction, bias (if any) is added locally.
    if not tensor_parallel:
        if has_bias:
            out = out + self.bias.value
        return out

    bias_fused = (has_bias and all_reduce_params is not None
                  and all_reduce_params.fusion_op
                  == AllReduceFusionOp.RESIDUAL_RMS_NORM)
    if bias_fused:
        # Bias travels with the all-reduce parameters instead of being
        # added below (presumably applied inside the fused op -- confirm
        # against the allreduce implementation).
        all_reduce_params.bias = self.bias.value

    out = allreduce(out, self.tp_group, all_reduce_params=all_reduce_params)

    if bias_fused or not has_bias:
        return out
    return out + self.bias.value
244
245
246	class SmoothQuantLayerNorm(Module):

nothing calls this directly

smooth_quant_gemm · Function · 0.85

has_act_static_scaling · Method · 0.80

has_per_token_dynamic_scaling · Method · 0.80

has_per_channel_scaling · Method · 0.80

allreduce · Function · 0.50

no test coverage detected