MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / QuantMode

Class QuantMode

tensorrt_llm/quantization/mode.py:65–471  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

63
64
65class QuantMode(IntFlag):
66 # [WARNING] KEEP BELOW DEFINITION IN SYNC WITH cpp/tensorrt_llm/common/quantization.h
67
68 # The weights are quantized to 4 bits.
69 INT4_WEIGHTS = auto()
70 # The weights are quantized to 8 bits.
71 INT8_WEIGHTS = auto()
72 # The activations are quantized.
73 ACTIVATIONS = auto()
74 # The method uses one scaling factor per channel. It's pre-computed (static) from the weights.
75 PER_CHANNEL = auto()
76 # The method uses one scaling factor per token. It's computed on-the-fly.
77 PER_TOKEN = auto()
78 # The method uses one scaling factor per group. It's pre-computed (static) from the weights.
79 PER_GROUP = auto()
80 # The KV cache is quantized in INT8.
81 INT8_KV_CACHE = auto()
82 # The KV cache is quantized in FP8.
83 FP8_KV_CACHE = auto()
84 # FP8 QDQ
85 FP8_QDQ = auto()
86 # FP8 rowwise
87 FP8_ROWWISE = auto()
88 # FP8 block scales for Deepseek
89 FP8_1x128_128x128 = auto()
90 # W4A8 qserve
91 W4A8_QSERVE = auto()
92 # FP4
93 NVFP4 = auto()
94 NVFP4_KV_CACHE = auto()
95 # W4A8 NVFP4
96 W4A8_NVFP4_FP8 = auto()
97 # W4A8 MXFP4
98 W4A8_MXFP4_FP8 = auto()
99 W4A8_MXFP4_MXFP8 = auto()
100 W4A16_MXFP4 = auto()
101
102 # The smallest power-of-two that is not used by a flag. Do not call auto() after that line.
103 COUNT = auto()
104
105 # Bitmask to detect if weights, activations or both are quantized.
106 WEIGHTS_AND_ACTIVATIONS = INT4_WEIGHTS | INT8_WEIGHTS | ACTIVATIONS
107 # The mask of all valid flags.
108 VALID_FLAGS = COUNT - 1
109
110 def __deepcopy__(self, memo):
111 return self
112
113 # Are exactly the bits in "bits" set (and no others)? You can restrict the test to the bits indicated by "mask".
114 def _all(self, bits, mask=VALID_FLAGS):
115 return (self & mask) == bits
116
117 # Is at least one of the bits in "bits" set?
118 def _any(self, bits):
119 return (self & bits) != 0
120
121 def is_int8_weight_only(self):
122 return self._all(self.INT8_WEIGHTS, self.WEIGHTS_AND_ACTIVATIONS)

Callers 15

_construct_executionMethod · 0.90
forwardMethod · 0.90
test_mlp_comparisonMethod · 0.90
test_ootb_comparisonMethod · 0.90
create_sessionFunction · 0.90
test_gated_mlpMethod · 0.90
add_tllm_weightFunction · 0.90
__init__Method · 0.90
__init__Method · 0.90

Calls

no outgoing calls

Tested by 7

_construct_executionMethod · 0.72
forwardMethod · 0.72
test_mlp_comparisonMethod · 0.72
test_ootb_comparisonMethod · 0.72
test_gated_mlpMethod · 0.72