hub / github.com/NVIDIA/TensorRT-LLM / __init__

Method __init__

tensorrt_llm/layers/moe.py:740–869 · view source on GitHub ↗

(self,
                 moe_config: MoeConfig,
                 hidden_size: int,
                 ffn_hidden_size: int,
                 hidden_act: str,
                 mapping: Mapping = Mapping(),
                 bias: bool = True,
                 dtype=None,
                 tp_group: List[int] = None,
                 tp_size: int = 1,
                 quant_mode=QuantMode(0),
                 use_all_reduce=True,
                 pre_quant_scale=False,
                 zero=False,
                 use_w4a8_awq=False,
                 use_int8_weight=False,
                 group_size: int = -1,
                 static_routing=False)

Source from the content-addressed store, hash-verified

738	class MixtureOfExperts(Module):
739
740	def __init__(self,
741	moe_config: MoeConfig,
742	hidden_size: int,
743	ffn_hidden_size: int,
744	hidden_act: str,
745	mapping: Mapping = Mapping(),
746	bias: bool = True,
747	dtype=None,
748	tp_group: List[int] = None,
749	tp_size: int = 1,
750	quant_mode=QuantMode(0),
751	use_all_reduce=True,
752	pre_quant_scale=False,
753	zero=False,
754	use_w4a8_awq=False,
755	use_int8_weight=False,
756	group_size: int = -1,
757	static_routing=False):
758	super().__init__()
759
760	self.moe_config = moe_config
761	self.num_experts = moe_config.num_experts
762	self.top_k = moe_config.top_k
763
764	self.hidden_act = hidden_act
765	self.hidden_size = hidden_size
766	self.ffn_hidden_size = ffn_hidden_size
767	self.expert_inter_size = ffn_hidden_size
768	self.dtype = dtype
769	self.weight_dtype = dtype
770	self.tp_group = tp_group
771	self.tp_size = tp_size
772	self.mapping = mapping
773	self.quant_mode = quant_mode
774	self.bias = bias
775	self.use_all_reduce = use_all_reduce
776	self.zero = zero
777	self.pre_quant_scale = pre_quant_scale
778	self.use_w4a8_awq = use_w4a8_awq
779	self.use_int8_weight = use_int8_weight
780	self.group_size = group_size
781
782	if self.use_int8_weight and self.group_size > 0:
783	raise NotImplementedError("INT8-GPTQ is not implemented for MoE.")
784
785	self.static_routing = static_routing
786
787	self.experts_per_node = self.num_experts
788	if self.mapping.has_moe_ep():
789	if self.num_experts % self.mapping.moe_ep_size != 0:
790	raise ValueError(
791	f"MixtureOfExperts - Number of experts {self.num_experts} is not a multiple of EP size {self.mapping.moe_ep_size}"
792	)
793	self.experts_per_node = self.experts_per_node // self.mapping.moe_ep_size
794
795	if self.mapping.has_moe_tp():
796	if self.ffn_hidden_size % self.mapping.moe_tp_size != 0:
797	raise ValueError(

Callers

nothing calls this directly

Calls 13

init_experts Method · 0.95

Mapping Class · 0.85

QuantMode Class · 0.85

RowLinear Class · 0.85

has_moe_ep Method · 0.80

has_moe_tp Method · 0.80

is_weight_only Method · 0.80

ep_experts Method · 0.80

__init__ Method · 0.45

has_fp8_rowwise Method · 0.45

has_fp8_qdq Method · 0.45

update Method · 0.45

Tested by

no test coverage detected

Method __init__

Source from the content-addressed store, hash-verified

Callers

Calls 13

Tested by

Method __init__