| 1244 | self.register_parameter('quantization_scaling_factor', None) |
| 1245 | |
def forward(self, hidden_states, lora_layer_params=None):
    """Run the MLP: fc -> activation -> smoothing -> optional quantization -> proj.

    Args:
        hidden_states: Input passed straight to ``self.fc``.
        lora_layer_params: Accepted for interface compatibility; this
            method does not read it.

    Returns:
        The result of ``self.proj`` applied to the smoothed (and, when the
        quant mode asks for it, quantized) activation.
    """
    fc_out = self.fc(hidden_states)
    activated = ACT2FN[self.hidden_act](fc_out)

    # Divide by the projection layer's smoother, cast to the activation dtype
    # so both operands of the division share a dtype.
    smoother = cast(self.proj.smoother.value, activated.dtype)
    smoothed = activated / smoother

    mode = self.quant_mode
    if not mode.has_act_and_weight_quant():
        # No activation+weight quantization requested: project as-is.
        return self.proj(smoothed)

    if mode.has_act_static_scaling():
        # Static scaling: quantize directly with the stored scaling factor
        # rather than going through quantization layers, which break the
        # int8 plugins.
        proj_input = quantize_tensor(smoothed,
                                     self.quantization_scaling_factor.value)
    else:
        # Dynamic scaling: per-token quantization yields a tuple of the
        # quantized tensor and the per-token scaling factors.
        proj_input = quantize_per_token(smoothed)

    return self.proj(proj_input)
| 1263 | |
| 1264 | |
| 1265 | class Int8SmoothQuantRowLinear(RowLinear): |