(model, quant_mode)
| 185 | |
| 186 | |
def smooth_quantize_plugin(model, quant_mode):
    """Replace supported layers of ``model`` with SmoothQuant variants.

    Walks ``model.named_modules_with_parent()`` and, for every layer that
    is an instance of one of the supported classes, builds the matching
    SmoothQuant layer and installs it on the parent under the same
    attribute name. Layers whose attribute name is ``ln_f`` or
    ``ln_embed`` are left untouched.

    Args:
        model: Module tree to convert; it is modified in place.
        quant_mode: Value passed to every replacement layer as its
            ``quant_mode`` init argument and stored on the returned model.

    Returns:
        The converted model. This is the object passed in unless a
        matching layer had no parent, in which case the replacement layer
        itself is returned.
    """
    # Insertion order is the match priority: the first class for which
    # isinstance() succeeds decides the replacement.
    # NOTE(review): GatedMLP is listed ahead of MLP, presumably because it
    # derives from MLP -- confirm before reordering.
    smooth_quant_classes = {
        RmsNorm: SmoothQuantRmsNorm,
        LayerNorm: SmoothQuantLayerNorm,
        GatedMLP: SmoothQuantGatedMLP,
        MLP: SmoothQuantMLP,
        Attention: SmoothQuantAttention,
    }
    excluded_names = ('ln_f', 'ln_embed')

    for qualified_name, module, owner in model.named_modules_with_parent():
        # Only the last path component is the attribute name on the owner.
        attr_name = qualified_name.rpartition('.')[2]
        if attr_name in excluded_names:
            continue

        for plain_cls, candidate_cls in smooth_quant_classes.items():
            if isinstance(module, plain_cls):
                replacement_cls = candidate_cls
                break
        else:
            # Not a layer type this pass knows how to convert.
            continue

        ctor_kwargs = get_init_params(module, replacement_cls)
        ctor_kwargs["quant_mode"] = quant_mode
        if isinstance(module, Attention):
            # NOTE(review): scaling by tp_size presumably restores the
            # head count across all tensor-parallel ranks -- confirm.
            total_heads = module.num_attention_heads * module.tp_size
            ctor_kwargs["num_attention_heads"] = total_heads

        replacement = replacement_cls(**ctor_kwargs)
        if parent_missing := owner is None:
            # The matched layer has no parent, so it becomes the result.
            model = replacement
        if not parent_missing:
            setattr(owner, attr_name, replacement)

    model.quant_mode = quant_mode
    return model
| 222 | |
| 223 | |
| 224 | def smooth_quantize(model, quant_config: QuantConfig): |
no test coverage detected