MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / weight_only_groupwise_quantize

Function weight_only_groupwise_quantize

tensorrt_llm/quantization/quantize.py:134–167  ·  view source on GitHub ↗
(model,
                                   quant_config: QuantConfig,
                                   model_config=None)

Source from the content-addressed store, hash-verified

132
133
def weight_only_groupwise_quantize(model,
                                   quant_config: QuantConfig,
                                   model_config=None):
    """Swap a model's linear layers for weight-only groupwise variants.

    The replacement itself is delegated to ``quantize_layers``; this function
    supplies the layer-class mapping and a hook that fills in the
    groupwise-specific constructor arguments.

    Args:
        model: Model to quantize. Its ``config`` attribute is used when
            present.
        quant_config: Quantization settings. ``quant_mode`` must report
            weight-only; ``group_size``, ``pre_quant_scale``,
            ``has_zero_point`` and ``quant_algo`` are read from it.
        model_config: Fallback configuration, used only when ``model`` has no
            ``config`` attribute.

    Returns:
        Whatever ``quantize_layers`` returns for the given model.

    Raises:
        AssertionError: If ``quant_config.quant_mode`` is not weight-only.
    """
    assert quant_config.quant_mode.is_weight_only()

    # Use the model's own config when it has one; otherwise fall back to the
    # explicitly supplied ``model_config``.
    model_cfg = getattr(model, "config", model_config)

    # Source layer class -> class to instantiate in its place.
    # MixtureOfExperts maps to itself.
    layer_replacements = {
        ColumnLinear: WeightOnlyGroupwiseQuantColumnLinear,
        RowLinear: WeightOnlyGroupwiseQuantRowLinear,
        MixtureOfExperts: MixtureOfExperts,
    }

    def apply_groupwise_params(init_params, name, module):
        # Constructor arguments derived from the quantization config.
        overrides = {
            "group_size": quant_config.group_size,
            "pre_quant_scale": quant_config.pre_quant_scale,
            "zero": quant_config.has_zero_point,
            "use_w4a8_awq": quant_config.quant_algo == QuantAlgo.W4A8_AWQ,
            "use_int8_weight":
            quant_config.quant_algo == QuantAlgo.W8A16_GPTQ,
        }
        for key, value in overrides.items():
            init_params[key] = value

        # Only layers that already take a tensor-parallel rank get it
        # overwritten from the resolved config's mapping.
        if "tp_rank" in init_params:
            init_params["tp_rank"] = model_cfg.mapping.tp_rank

    return quantize_layers(
        model,
        quant_config,
        layer_replacements,
        apply_groupwise_params,
    )
168
169
170def smooth_quantize_ootb(

Callers 1

quantizeFunction · 0.85

Calls 2

quantize_layersFunction · 0.85
is_weight_onlyMethod · 0.80

Tested by

no test coverage detected