Method forward

tensorrt_llm/quantization/layers.py:1246–1262 · view source on GitHub ↗

(self, hidden_states, lora_layer_params=None)

Source from the content-addressed store, hash-verified

1244	self.register_parameter('quantization_scaling_factor', None)
1245
def forward(self, hidden_states, lora_layer_params=None):
    """Apply the MLP: fc, activation, smoothing, optional quantization, proj.

    The activated output of ``self.fc`` is divided by the smoother held on
    ``self.proj`` (cast to the activation's dtype). When the quant mode
    reports activation-and-weight quantization, the result is quantized
    before it is handed to ``self.proj``.

    Args:
        hidden_states: Input passed to ``self.fc``.
        lora_layer_params: Accepted in the signature but not read by this
            method.

    Returns:
        Whatever ``self.proj`` returns for the (possibly quantized)
        intermediate value.
    """
    projected = self.fc(hidden_states)
    activated = ACT2FN[self.hidden_act](projected)

    # Bring the smoother to the activation dtype before dividing by it.
    smoother = cast(self.proj.smoother.value, activated.dtype)
    smoothed = activated / smoother

    if not self.quant_mode.has_act_and_weight_quant():
        return self.proj(smoothed)

    if self.quant_mode.has_act_static_scaling():
        # Avoid quantization layers as it breaks int8 plugins
        quantized = quantize_tensor(
            smoothed, self.quantization_scaling_factor.value)
    else:
        # Per-token quantization yields a tuple: the quantized tensor and
        # the scaling factors per token.
        quantized = quantize_per_token(smoothed)

    return self.proj(quantized)
1263
1264
1265	class Int8SmoothQuantRowLinear(RowLinear):

test_mlp_smooth_quant (Method) · 0.95

cast (Function) · 0.85

quantize_tensor (Function) · 0.85

quantize_per_token (Function) · 0.85

has_act_and_weight_quant (Method) · 0.80

has_act_static_scaling (Method) · 0.80

test_mlp_smooth_quant (Method) · 0.76