(self, x, lora_runtime_params=None)
| 143 | self.quant_mode = quant_mode |
| 144 | |
def forward(self, x, lora_runtime_params=None):
    """Run the SmoothQuant GEMM, then the optional bias add and all-gather.

    Args:
        x: With static activation scaling, the input passed straight to
            ``smooth_quant_gemm``. With dynamic activation scaling, a
            2-tuple ``(input, per_token_scale)``; per the original note
            this is an int8 tensor plus fp32 scaling factors.
        lora_runtime_params: Must be ``None``; LoRA is not supported by
            this layer.

    Returns:
        The GEMM result, plus ``self.bias.value`` when a bias is set,
        gathered along dimension 1 across ``self.tp_group`` when
        ``gather_output`` is enabled, ``tp_size > 1`` and a group exists.

    Raises:
        AssertionError: If ``lora_runtime_params`` is not ``None``.
    """
    assert lora_runtime_params is None, "lora is not supported on SmoothQuantLinear now"

    mode = self.quant_mode
    if not mode.has_act_static_scaling():
        # Dynamic activation scaling: the caller hands over the quantized
        # input together with its scaling factors as a pair.
        x, act_scale = x
    else:
        # Static activation scaling: the scale is stored on the layer.
        act_scale = self.act_scale.value

    weight = self.weight.value
    channel_scale = self.per_channel_scale.value
    out = smooth_quant_gemm(x, weight, act_scale, channel_scale,
                            mode.has_per_token_dynamic_scaling(),
                            mode.has_per_channel_scaling(), self.dtype)

    bias = self.bias
    if bias is not None:
        out = out + bias.value

    needs_gather = (self.gather_output and self.tp_size > 1
                    and self.tp_group is not None)
    if needs_gather:
        # [dim0, local_dim] -> [dim0 * tp_size, local_dim] --> [dim0, local_dim * tp_size]
        out = allgather(out, self.tp_group, gather_dim=1)

    return out
| 167 | |
| 168 | |
# Alias: SmoothQuantColumnLinear is bound to the very same class object as
# SmoothQuantLinear, so both names can be used interchangeably.
| 169 | SmoothQuantColumnLinear = SmoothQuantLinear
nothing calls this directly
no test coverage detected