(model,
quant_config: QuantConfig,
model_config=None)
| 132 | |
| 133 | |
def weight_only_groupwise_quantize(model,
                                   quant_config: QuantConfig,
                                   model_config=None):
    """Run ``quantize_layers`` over ``model`` with the group-wise weight-only layer mapping.

    Args:
        model: Object handed to ``quantize_layers``. Its ``config`` attribute,
            when present, supplies the mapping used to refresh ``tp_rank``.
        quant_config: Quantization settings. Its ``quant_mode`` must report
            weight-only; ``group_size``, ``pre_quant_scale``,
            ``has_zero_point`` and ``quant_algo`` are forwarded into every
            layer's init parameters.
        model_config: Fallback configuration, consulted only when ``model``
            has no ``config`` attribute.

    Returns:
        Whatever ``quantize_layers`` returns for the given model.

    Raises:
        AssertionError: If ``quant_config.quant_mode`` is not weight-only.
    """
    assert quant_config.quant_mode.is_weight_only()

    # Prefer the configuration carried by the model itself; only fall back to
    # the explicitly supplied one when the attribute lookup fails.
    model_cfg = getattr(model, "config", model_config)

    # Source layer class -> class passed to quantize_layers as its replacement.
    # MixtureOfExperts is deliberately mapped onto itself.
    quant_map = {
        ColumnLinear: WeightOnlyGroupwiseQuantColumnLinear,
        RowLinear: WeightOnlyGroupwiseQuantRowLinear,
        MixtureOfExperts: MixtureOfExperts,
    }

    def add_groupwise_params(init_params, name, module):
        # Callback for quantize_layers: mutates init_params in place.
        # `name` and `module` are part of the callback signature but unused.
        algo = quant_config.quant_algo
        init_params.update({
            "group_size": quant_config.group_size,
            "pre_quant_scale": quant_config.pre_quant_scale,
            "zero": quant_config.has_zero_point,
            "use_w4a8_awq": algo == QuantAlgo.W4A8_AWQ,
            "use_int8_weight": algo == QuantAlgo.W8A16_GPTQ,
        })
        # Only layers that already take a tp_rank get it overwritten with the
        # rank from the resolved model configuration.
        if "tp_rank" in init_params:
            init_params["tp_rank"] = model_cfg.mapping.tp_rank

    return quantize_layers(
        model,
        quant_config,
        quant_map,
        add_groupwise_params,
    )
| 168 | |
| 169 | |
| 170 | def smooth_quantize_ootb( |
no test coverage detected