Method forward

tensorrt_llm/quantization/layers.py:1745–1757 · view source on GitHub ↗

(self, hidden_states, lora_layer_params=None)

Source from the content-addressed store, hash-verified

1743	quant_mode=quant_mode)
1744
def forward(self, hidden_states, lora_layer_params=None):
    """Run the gated MLP: ``proj(act(fc(x)) * gate(x))``.

    Args:
        hidden_states: Input fed to both ``self.fc`` and ``self.gate``.
        lora_layer_params: Must be ``None``; LoRA is not supported by
            this layer.

    Returns:
        The result of ``self.proj`` applied to the gated activation.

    Raises:
        AssertionError: If ``lora_layer_params`` is not ``None``.
    """
    assert lora_layer_params is None, f"lora is not supported on {self.__class__.__name__} now"

    # Up-projection branch, followed by the configured activation.
    fc_out = self.fc(hidden_states)
    activated = ACT2FN[self.hidden_act](fc_out)

    # Gate branch, combined with the activated branch via `*`.
    gate_out = self.gate(hidden_states)
    gated = activated * gate_out

    if self.quant_mode.has_fp8_rowwise():
        # Per-token quantization yields a tuple: the quantized tensor
        # and the per-token scaling factors.
        clamp = None
        if self.clamp_val is not None:
            clamp = self.clamp_val.value
        gated = quantize_fp8_per_token(gated, clamp)

    return self.proj(gated)
1758
1759
1760	class Fp8RowwiseFusedGatedMLP(Module):

nothing calls this directly

quantize_fp8_per_token · Function · 0.85

has_fp8_rowwise · Method · 0.45

no test coverage detected