(
model,
quant_config: QuantConfig,
quant_map,
preprocess_init_params=None,
)
| 25 | |
| 26 | |
| 27 | def quantize_layers( |
| 28 | model, |
| 29 | quant_config: QuantConfig, |
| 30 | quant_map, |
| 31 | preprocess_init_params=None, |
| 32 | ): |
| 33 | exclude_modules = quant_config.exclude_modules |
| 34 | if exclude_modules is None: |
| 35 | exclude_modules = [ |
| 36 | '*lm_head', |
| 37 | '*router', |
| 38 | '*vocab_embedding', |
| 39 | '*position_embedding', |
| 40 | '*block_embedding', |
| 41 | '*shared_expert_gate', |
| 42 | ] |
| 43 | |
| 44 | for name, module, parent in model.named_modules_with_parent(): |
| 45 | module_name = name.rsplit('.', 1)[-1] |
| 46 | is_excluded = False |
| 47 | quant_cls = None |
| 48 | |
| 49 | # handle exclusion |
| 50 | for exclude_module in exclude_modules: |
| 51 | if fnmatch.fnmatchcase(name, exclude_module): |
| 52 | is_excluded = True |
| 53 | break |
| 54 | |
| 55 | # MoE modules are quantized in their constructor, so they must always |
| 56 | # be re-created with the appropriate quant_mode. When excluded, |
| 57 | # they are re-created with quant_mode 0. |
| 58 | # This needs special handling; we may want to redesign the MoE implementation. |
| 59 | if isinstance(module, MixtureOfExperts): |
| 60 | quant_cls = type(module) |
| 61 | elif not is_excluded: |
| 62 | for cls in quant_map: |
| 63 | if isinstance(module, cls): |
| 64 | quant_cls = quant_map[cls] |
| 65 | break |
| 66 | |
| 67 | if quant_cls: |
| 68 | init_params = get_init_params(module, quant_cls) |
| 69 | if isinstance(module, MixtureOfExperts): |
| 70 | if is_excluded: |
| 71 | quant_mode = QuantMode(0) |
| 72 | else: |
| 73 | quant_mode = quant_config.quant_mode |
| 74 | init_params["quant_mode"] = quant_mode |
| 75 | |
| 76 | # Auto-detect pre_quant_scale based on quant_algo |
| 77 | # For AWQ-based quantization methods that use pre_quant_scale |
| 78 | if quant_config.quant_algo in [ |
| 79 | QuantAlgo.W4A16_AWQ, QuantAlgo.NVFP4_AWQ, |
| 80 | QuantAlgo.W4A8_AWQ |
| 81 | ]: |
| 82 | init_params["pre_quant_scale"] = True |
| 83 | if "bias" in init_params and not isinstance(module, |
| 84 | MixtureOfExperts): |
no test coverage detected