(
self,
in_features,
out_features,
bias=True,
dtype=None,
tp_group=None,
tp_size=1,
tp_rank=0,
quant_mode=QuantMode.use_weight_only(),
prefer_managed_weight=True,
is_expert=False,
)
| 801 | class WeightOnlyQuantRowLinear(RowLinear): |
| 802 | |
def __init__(
    self,
    in_features,
    out_features,
    bias=True,
    dtype=None,
    tp_group=None,
    tp_size=1,
    tp_rank=0,
    quant_mode=QuantMode.use_weight_only(),
    prefer_managed_weight=True,
    is_expert=False,
):
    """Initialize the layer and its weight-only quantization parameters.

    Both feature sizes are rounded up to the next multiple of
    ``64 * tp_size`` before being handed to the parent constructor;
    ``self.is_padded`` records whether either one had to be rounded.

    Args:
        in_features: Requested input feature count (padded if needed).
        out_features: Requested output feature count (padded if needed).
        bias: Forwarded unchanged to the parent constructor.
        dtype: Forwarded to the parent constructor and used as the dtype
            of ``per_channel_scale``.
        tp_group: Forwarded unchanged to the parent constructor.
        tp_size: Forwarded to the parent constructor; also scales the
            padding multiple.
        tp_rank: Stored on ``self.tp_rank``. It is not forwarded to the
            parent constructor.
        quant_mode: Must report either int8 or int4 weight-only
            quantization. Stored on ``self.quant_mode``.
        prefer_managed_weight: Forwarded to the parent constructor and to
            the replacement ``weight`` parameter.
        is_expert: Forwarded unchanged to the parent constructor.

    Raises:
        ValueError: If ``quant_mode`` is neither int8 nor int4
            weight-only.
    """
    multiple = 64 * tp_size
    self.is_padded = False
    if in_features % multiple > 0:
        in_features = math.ceil(in_features / multiple) * multiple
        self.is_padded = True
    if out_features % multiple > 0:
        out_features = math.ceil(out_features / multiple) * multiple
        self.is_padded = True

    super().__init__(in_features,
                     out_features,
                     bias=bias,
                     dtype=dtype,
                     tp_group=tp_group,
                     tp_size=tp_size,
                     prefer_managed_weight=prefer_managed_weight,
                     is_expert=is_expert)

    # Divisor applied to the stored output dimension: 1 for int8, 2 for int4
    # (presumably two 4-bit values share one int8 element -- confirm).
    if quant_mode.is_int8_weight_only():
        self.weight_only_quant_mode = 1
    elif quant_mode.is_int4_weight_only():
        self.weight_only_quant_mode = 2
    else:
        # This constructor would otherwise leave the divisor unset and fail
        # on the attribute lookup in the shape computation below.
        raise ValueError(
            "WeightOnlyQuantRowLinear requires an int8 or int4 weight-only "
            f"quant_mode, got {quant_mode!r}")

    # We use a fake tensor with data_type = int8.
    # self.in_features / self.out_features are read back from the instance
    # after the parent constructor has run. Floor division keeps the shape
    # computation in integers.
    self.weight = Parameter(shape=(self.in_features,
                                   self.out_features //
                                   self.weight_only_quant_mode),
                            dtype="int8",
                            prefer_managed=prefer_managed_weight)
    self.per_channel_scale = Parameter(shape=(self.out_features, ),
                                       dtype=dtype)
    self.tp_rank = tp_rank
    if self.is_padded:
        # Only overridden when padding was applied; otherwise whatever the
        # parent constructor set is left in place.
        self.tp_dim = -1
    self.quant_mode = quant_mode
| 849 | |
| 850 | def forward(self, x, lora_runtime_params=None, all_reduce_params=None): |
| 851 | hidden_state = x |
nothing calls this directly
no test coverage detected