hub / github.com/NVIDIA/TensorRT-LLM / __init__

Method __init__

tensorrt_llm/quantization/layers.py:899–931 · view source on GitHub ↗

(
            self,
            num_embeddings: int,
            embedding_dim: int,
            dtype: Optional[str] = None,
            tp_size: int = 1,
            tp_group: Optional[list] = None,
            sharding_dim: int = 0,
            tp_rank: Optional[int] = None,
            quant_mode=QuantMode.use_weight_only(),
    )

Source from the content-addressed store, hash-verified

897	class WeightOnlyQuantEmbedding(Embedding):
898
def __init__(
    self,
    num_embeddings: int,
    embedding_dim: int,
    dtype: Optional[str] = None,
    tp_size: int = 1,
    tp_group: Optional[list] = None,
    sharding_dim: int = 0,
    tp_rank: Optional[int] = None,
    quant_mode=QuantMode.use_weight_only(),
):
    """Set up an embedding layer whose weight table is declared as int8.

    The base-class constructor is invoked first with the same sizing and
    tensor-parallel arguments; this constructor then records the quant
    mode, declares a ``per_token_scale`` parameter and replaces
    ``self.weight`` with an int8 parameter shaped for the requested
    sharding dimension.

    Args:
        num_embeddings: Row count of the full (unsharded) table.
        embedding_dim: Width of each embedding row.
        dtype: Dtype given to the parent constructor and used for
            ``per_token_scale``.
        tp_size: Number of shards the table is divided across.
        tp_group: Forwarded unchanged to the parent constructor.
        sharding_dim: ``0`` divides the rows across ``tp_size`` (rounded
            up); ``1`` divides the columns across ``tp_size`` (floor
            division).
        tp_rank: Forwarded unchanged to the parent constructor.
        quant_mode: Stored on the instance as ``self.quant_mode``.

    Raises:
        ValueError: If ``sharding_dim`` is neither ``0`` nor ``1``.
    """
    super().__init__(
        num_embeddings,
        embedding_dim,
        dtype,
        tp_size,
        tp_group,
        sharding_dim,
        tp_rank)
    # Only int8 weight-only quantization is supported for now.
    # TODO: support int4 weight-only quantization.
    self.quant_mode = quant_mode
    # One scale entry per row of the full table, kept in the layer dtype.
    # NOTE(review): this is not divided by tp_size even when the rows are
    # sharded (sharding_dim == 0) -- confirm that is what forward expects.
    self.per_token_scale = Parameter(shape=(self.num_embeddings, ),
                                     dtype=dtype)

    # self.num_embeddings / self.embedding_dim / self.tp_size are read back
    # from the instance; presumably the parent constructor stores them.
    if sharding_dim == 1:
        # Column sharding: each rank keeps every row but a slice of columns.
        self.weight = Parameter(shape=(self.num_embeddings,
                                       self.embedding_dim // self.tp_size),
                                dtype="int8")
    elif sharding_dim == 0:
        # Row sharding: round up so a row count that does not divide
        # evenly is still fully covered.
        self.weight = Parameter(shape=(math.ceil(
            self.num_embeddings / self.tp_size), self.embedding_dim),
                                dtype="int8")
    else:
        # Previously an unsupported value fell through silently and no
        # int8 weight was declared; fail loudly instead.
        raise ValueError(
            f"sharding_dim must be 0 or 1, got {sharding_dim!r}")
932
933	def forward(self, x):
934	result = embedding(x,

Callers

nothing calls this directly

Calls 3

Parameter (Class) · 0.85

use_weight_only (Method) · 0.80

__init__ (Method) · 0.45

Tested by

no test coverage detected

Method __init__

Source from the content-addressed store, hash-verified

Callers

Calls 3

Tested by

Method __init__