(self, hidden_states, lora_layer_params=None)
| 2432 | self.register_parameter('quantization_scaling_factor', None) |
| 2433 | |
def forward(self, hidden_states, lora_layer_params=None):
    """Run the gated MLP: ``proj(act(fc(x)) * gate(x) / smoother)``.

    The activated ``fc`` branch is multiplied element-wise by the ``gate``
    branch, divided by the smoother stored on ``self.proj`` (cast to
    ``self.dtype``), optionally quantized according to ``self.quant_mode``,
    and finally passed through ``self.proj``.

    Args:
        hidden_states: Input fed to both the ``fc`` and ``gate`` sub-layers.
        lora_layer_params: Must be ``None``; LoRA is not supported here.

    Returns:
        The result of ``self.proj`` applied to the (possibly quantized)
        smoothed, gated activation.

    Raises:
        AssertionError: If ``lora_layer_params`` is not ``None``.
    """
    assert lora_layer_params is None, (
        f"lora is not supported on {self.__class__.__name__} now")

    # Activation branch first, then the gate branch, then combine them.
    fc_out = self.fc(hidden_states)
    activated = ACT2FN[self.hidden_act](fc_out)
    gated = activated * self.gate(hidden_states)

    # Divide by the projection's smoother before any quantization.
    smoothed = gated / cast(self.proj.smoother.value, self.dtype)

    # Without combined activation+weight quantization, project directly.
    if not self.quant_mode.has_act_and_weight_quant():
        return self.proj(smoothed)

    if self.quant_mode.has_act_static_scaling():
        # Avoid quantization layers as it breaks int8 plugins
        proj_input = quantize_tensor(smoothed,
                                     self.quantization_scaling_factor.value)
    else:
        # Quantize per token outputs tuple:
        # quantized tensor and scaling factors per token
        proj_input = quantize_per_token(smoothed)

    return self.proj(proj_input)
| 2454 | |
| 2455 | |
| 2456 | class SmoothQuantAttention(Module): |
nothing calls this directly
no test coverage detected