(self, x, lora_runtime_params=None, all_reduce_params=None)
| 668 | self.tllm_to_externel_key_dict = {"weight": ["weight", "weight_scale"]} |
| 669 | |
def forward(self, x, lora_runtime_params=None, all_reduce_params=None):
    """Run the FP8 rowwise GEMM, then reduce across ranks and add the bias.

    Args:
        x: A two-item sequence that unpacks into the GEMM input and its
            per-token scale.
        lora_runtime_params: Must be ``None``; the assertion below rejects
            any other value.
        all_reduce_params: Optional parameters forwarded to ``allreduce``.
            When this layer has a bias and ``fusion_op`` is
            ``AllReduceFusionOp.RESIDUAL_RMS_NORM``, its ``bias`` attribute
            is overwritten with this layer's bias (the caller's object is
            mutated in place).

    Returns:
        The GEMM result, all-reduced over ``self.tp_group`` when
        ``self.tp_size > 1`` and a group is set, with the bias applied
        (either inside the all-reduce or by an explicit addition) when the
        layer has one.
    """
    # NOTE(review): the message names SmoothQuantRowLinear although this
    # path calls fp8_rowwise_gemm -- possibly a copy-paste leftover; confirm
    # the enclosing class name. The text is intentionally left unchanged.
    assert lora_runtime_params is None, "lora is not supported on SmoothQuantRowLinear now"

    activation, per_token_scale = x
    out = fp8_rowwise_gemm(
        activation,
        self.weight.value,
        per_token_scale,
        self.per_channel_scale.value,
        self.quant_mode.has_fp8_rowwise(),
        self.quant_mode.has_fp8_rowwise(),
    )

    tensor_parallel = self.tp_size > 1 and self.tp_group is not None
    if not tensor_parallel:
        # No cross-rank reduction: the bias (if any) is added directly.
        if self.bias is None:
            return out
        return out + self.bias.value

    has_bias = self.bias is not None

    # The bias is handed to the all-reduce only when the caller requested
    # the RESIDUAL_RMS_NORM fusion; otherwise it is added afterwards.
    bias_fused = False
    if has_bias and all_reduce_params is not None:
        bias_fused = (
            all_reduce_params.fusion_op == AllReduceFusionOp.RESIDUAL_RMS_NORM
        )
    if bias_fused:
        all_reduce_params.bias = self.bias.value

    out = allreduce(out, self.tp_group, all_reduce_params=all_reduce_params)

    if has_bias and not bias_fused:
        out = out + self.bias.value
    return out
| 695 | |
| 696 | def postprocess(self, tllm_key, weights, **kwargs): |
| 697 | if "per_channel_scale" in tllm_key: |
nothing calls this directly
no test coverage detected