(self,
moe_config: MoeConfig,
hidden_size: int,
ffn_hidden_size: int,
hidden_act: str,
mapping: Mapping = Mapping(),
bias: bool = True,
dtype=None,
tp_group: List[int] = None,
tp_size: int = 1,
quant_mode=QuantMode(0),
use_all_reduce=True,
pre_quant_scale=False,
zero=False,
use_w4a8_awq=False,
use_int8_weight=False,
group_size: int = -1,
static_routing=False)
| 738 | class MixtureOfExperts(Module): |
| 739 | |
| 740 | def __init__(self, |
| 741 | moe_config: MoeConfig, |
| 742 | hidden_size: int, |
| 743 | ffn_hidden_size: int, |
| 744 | hidden_act: str, |
| 745 | mapping: Mapping = Mapping(), |
| 746 | bias: bool = True, |
| 747 | dtype=None, |
| 748 | tp_group: List[int] = None, |
| 749 | tp_size: int = 1, |
| 750 | quant_mode=QuantMode(0), |
| 751 | use_all_reduce=True, |
| 752 | pre_quant_scale=False, |
| 753 | zero=False, |
| 754 | use_w4a8_awq=False, |
| 755 | use_int8_weight=False, |
| 756 | group_size: int = -1, |
| 757 | static_routing=False): |
| 758 | super().__init__() |
| 759 | |
| 760 | self.moe_config = moe_config |
| 761 | self.num_experts = moe_config.num_experts |
| 762 | self.top_k = moe_config.top_k |
| 763 | |
| 764 | self.hidden_act = hidden_act |
| 765 | self.hidden_size = hidden_size |
| 766 | self.ffn_hidden_size = ffn_hidden_size |
| 767 | self.expert_inter_size = ffn_hidden_size |
| 768 | self.dtype = dtype |
| 769 | self.weight_dtype = dtype |
| 770 | self.tp_group = tp_group |
| 771 | self.tp_size = tp_size |
| 772 | self.mapping = mapping |
| 773 | self.quant_mode = quant_mode |
| 774 | self.bias = bias |
| 775 | self.use_all_reduce = use_all_reduce |
| 776 | self.zero = zero |
| 777 | self.pre_quant_scale = pre_quant_scale |
| 778 | self.use_w4a8_awq = use_w4a8_awq |
| 779 | self.use_int8_weight = use_int8_weight |
| 780 | self.group_size = group_size |
| 781 | |
| 782 | if self.use_int8_weight and self.group_size > 0: |
| 783 | raise NotImplementedError("INT8-GPTQ is not implemented for MoE.") |
| 784 | |
| 785 | self.static_routing = static_routing |
| 786 | |
| 787 | self.experts_per_node = self.num_experts |
| 788 | if self.mapping.has_moe_ep(): |
| 789 | if self.num_experts % self.mapping.moe_ep_size != 0: |
| 790 | raise ValueError( |
| 791 | f"MixtureOfExperts - Number of experts {self.num_experts} is not a multiple of EP size {self.mapping.moe_ep_size}" |
| 792 | ) |
| 793 | self.experts_per_node = self.experts_per_node // self.mapping.moe_ep_size |
| 794 | |
| 795 | if self.mapping.has_moe_tp(): |
| 796 | if self.ffn_hidden_size % self.mapping.moe_tp_size != 0: |
| 797 | raise ValueError( |
Nothing calls `MixtureOfExperts.__init__` directly.
No test coverage was detected for it.