(model, quant_config: QuantConfig)
| 496 | |
| 497 | # TODO: Duplicates smooth_quantize and quantize_layers |
def qserve_quantize(model, quant_config: QuantConfig):
    """Swap supported layers of ``model`` for their QServe counterparts.

    Walks ``model.named_modules_with_parent()`` and, for every layer that is
    an instance of one of the mapped classes, builds a replacement layer and
    attaches it to the parent under the same attribute name. Layers whose
    final name component is ``ln_f`` or ``ln_embed`` are left untouched.

    Args:
        model: Module exposing ``named_modules_with_parent()``.
        quant_config: Quantization configuration; its ``quant_mode`` must
            report ``is_qserve_w4a8()``.

    Returns:
        The model with matching layers replaced and its ``quant_mode``
        attribute set. If the root module itself matches (its parent is
        ``None``), the replacement layer is returned instead.

    Raises:
        AssertionError: If the quant mode is not QServe W4A8.
    """
    quant_mode = quant_config.quant_mode
    assert quant_config.quant_mode.is_qserve_w4a8()

    # The first isinstance match in insertion order wins, so the order of
    # these entries is significant and must be kept as-is.
    quant_map = {
        RmsNorm: QServeRmsNorm,
        LayerNorm: QServeRmsNorm,
        GatedMLP: QServeGatedMLP,
        MLP: QServeMLP,
        Attention: QServeAttention,
    }
    skipped_names = ('ln_f', 'ln_embed')

    for name, layer, parent in model.named_modules_with_parent():
        # Only the last dotted component identifies the attribute on parent.
        layer_name = name.rpartition('.')[2]
        if layer_name in skipped_names:
            continue

        quant_cls = next(
            (target for source, target in quant_map.items()
             if isinstance(layer, source)),
            None,
        )
        if quant_cls is None:
            continue

        init_params = get_init_params(layer, quant_cls)
        init_params["quant_mode"] = quant_mode
        if isinstance(layer, Attention):
            # NOTE(review): presumably the layer stores a per-rank head count
            # and the constructor expects the global one -- confirm.
            total_heads = layer.num_attention_heads * layer.tp_size
            init_params["num_attention_heads"] = total_heads
        quant_layer = quant_cls(**init_params)

        if parent is None:
            # The root module itself was replaced.
            model = quant_layer
        else:
            setattr(parent, layer_name, quant_layer)

    model.quant_mode = quant_mode
    return model
| 537 | |
| 538 | |
| 539 | def fp4_quantize(model, quant_config: QuantConfig): |
no test coverage detected