MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / quantize_layers

Function quantize_layers

tensorrt_llm/quantization/quantize.py:27–100  ·  view source on GitHub ↗
(
    model,
    quant_config: QuantConfig,
    quant_map,
    preprocess_init_params=None,
)

Source from the content-addressed store, hash-verified

25
26
27def quantize_layers(
28 model,
29 quant_config: QuantConfig,
30 quant_map,
31 preprocess_init_params=None,
32):
33 exclude_modules = quant_config.exclude_modules
34 if exclude_modules is None:
35 exclude_modules = [
36 '*lm_head',
37 '*router',
38 '*vocab_embedding',
39 '*position_embedding',
40 '*block_embedding',
41 '*shared_expert_gate',
42 ]
43
44 for name, module, parent in model.named_modules_with_parent():
45 module_name = name.rsplit('.', 1)[-1]
46 is_excluded = False
47 quant_cls = None
48
49 # handle exclusion
50 for exclude_module in exclude_modules:
51 if fnmatch.fnmatchcase(name, exclude_module):
52 is_excluded = True
53 break
54
55 # MoE modules are quantized on their constructor, so they must always
56 # be re-created with the appropriate quant_mode. When excluded,
57 # re-create with quant_mode 0.
58 # We need to handle it specially, we may want to redesign MoE implementation
59 if isinstance(module, MixtureOfExperts):
60 quant_cls = type(module)
61 elif not is_excluded:
62 for cls in quant_map:
63 if isinstance(module, cls):
64 quant_cls = quant_map[cls]
65 break
66
67 if quant_cls:
68 init_params = get_init_params(module, quant_cls)
69 if isinstance(module, MixtureOfExperts):
70 if is_excluded:
71 quant_mode = QuantMode(0)
72 else:
73 quant_mode = quant_config.quant_mode
74 init_params["quant_mode"] = quant_mode
75
76 # Auto-detect pre_quant_scale based on quant_algo
77 # For AWQ-based quantization methods that use pre_quant_scale
78 if quant_config.quant_algo in [
79 QuantAlgo.W4A16_AWQ, QuantAlgo.NVFP4_AWQ,
80 QuantAlgo.W4A8_AWQ
81 ]:
82 init_params["pre_quant_scale"] = True
83 if "bias" in init_params and not isinstance(module,
84 MixtureOfExperts):

Callers 5

weight_only_quantize · Function · 0.85
smooth_quantize_ootb · Function · 0.85
fp8_quantize · Function · 0.85
fp4_quantize · Function · 0.85

Calls 4

get_init_params · Function · 0.85
QuantMode · Class · 0.85
preprocess_init_params · Function · 0.85

Tested by

no test coverage detected