(self, x, lora_runtime_params=None, all_reduce_params=None)
| 213 | self.quant_mode = quant_mode |
| 214 | |
def forward(self, x, lora_runtime_params=None, all_reduce_params=None):
    """Apply the smooth-quant GEMM, then the tensor-parallel reduction and bias.

    Args:
        x: The activation input. With static activation scaling it is passed
            to the GEMM as-is; otherwise it is unpacked as a pair
            ``(activation, per_token_scale)``.
        lora_runtime_params: Must be ``None``; LoRA is rejected by an assert.
        all_reduce_params: Optional parameters forwarded to ``allreduce``.
            When a bias exists and ``fusion_op`` is
            ``AllReduceFusionOp.RESIDUAL_RMS_NORM``, its ``bias`` attribute is
            overwritten with this layer's bias so the bias add is left to the
            all-reduce call.

    Returns:
        The GEMM output, all-reduced over ``self.tp_group`` when
        ``self.tp_size > 1`` and a group is set, with the bias applied
        (either here or via ``all_reduce_params``) when the layer has one.
    """
    assert lora_runtime_params is None, "lora is not supported on SmoothQuantRowLinear now"

    mode = self.quant_mode
    if mode.has_act_static_scaling():
        # Static scaling: the scale is a layer parameter, x is used directly.
        activation, act_scale = x, self.act_scale.value
    else:
        # Dynamic scaling: the caller supplies the scale alongside the input.
        activation, act_scale = x

    out = smooth_quant_gemm(
        activation,
        self.weight.value,
        act_scale,
        self.per_channel_scale.value,
        mode.has_per_token_dynamic_scaling(),
        mode.has_per_channel_scaling(),
        self.dtype,
    )

    is_tensor_parallel = self.tp_size > 1 and self.tp_group is not None
    has_bias = self.bias is not None

    if not is_tensor_parallel:
        # Single-rank path: no reduction, just the optional bias add.
        if has_bias:
            out = out + self.bias.value
        return out

    # The bias is handed to the all-reduce only for the residual/RMS-norm
    # fusion; in every other case it is added after the reduction below.
    bias_fused = (has_bias and all_reduce_params is not None
                  and all_reduce_params.fusion_op
                  == AllReduceFusionOp.RESIDUAL_RMS_NORM)
    if bias_fused:
        all_reduce_params.bias = self.bias.value

    out = allreduce(out, self.tp_group, all_reduce_params=all_reduce_params)

    if has_bias and not bias_fused:
        out = out + self.bias.value
    return out
| 244 | |
| 245 | |
| 246 | class SmoothQuantLayerNorm(Module): |
nothing calls this directly
no test coverage detected