Method forward

tensorrt_llm/quantization/layers.py:1810–1829 · view source on GitHub ↗

(self, hidden_states, lora_layer_params=None)

Source from the content-addressed store, hash-verified

1808	self.register_parameter('clamp_val', None)
1809
def forward(self, hidden_states, lora_layer_params=None):
    """Apply ``fused_fc``, a gated activation, optional FP8 quantization, and ``proj``.

    Args:
        hidden_states: Input handed to ``self.fused_fc``.
        lora_layer_params: Must be ``None``; LoRA is not supported here.

    Returns:
        The output of ``self.proj`` applied to the activated intermediate
        (per-token FP8-quantized first when the quant mode reports FP8
        rowwise).

    Raises:
        AssertionError: If ``lora_layer_params`` is not ``None``.
        NotImplementedError: If ``self.hidden_act`` is neither ``'silu'``
            nor ``'gelu'``.
    """
    assert lora_layer_params is None, f"lora is not supported on {self.__class__.__name__} now"
    fused_out = self.fused_fc(hidden_states)

    # Only the two gated activations below are wired up; reject anything else
    # after the fused FC has been applied, as the original ordering did.
    if self.hidden_act not in ('silu', 'gelu'):
        raise NotImplementedError(
            f"Activation {self.hidden_act} not yet implemented for {self.__class__.__name__}."
        )
    gated_key = 'swiglu' if self.hidden_act == 'silu' else 'geglu'
    activated = ACT2FN[gated_key](fused_out)

    if self.quant_mode.has_fp8_rowwise():
        # Quantize per token outputs tuple:
        # quantized tensor and scaling factors per token
        clamp = self.clamp_val
        if clamp is not None:
            clamp = clamp.value
        activated = quantize_fp8_per_token(activated, clamp)

    return self.proj(activated)
1830
1831
1832	class Fp8RowwiseAttention(Module):

nothing calls this directly

quantize_fp8_per_token — Function · 0.85

has_fp8_rowwise — Method · 0.45

no test coverage detected