MCPcopy Index your code
hub / github.com/NVIDIA/TensorRT-LLM / qserve_quantize

Function qserve_quantize

tensorrt_llm/quantization/quantize.py:498–536  ·  view source on GitHub ↗
(model, quant_config: QuantConfig)

Source from the content-addressed store, hash-verified

496
497# TODO: Duplicates smooth_quantize and quantize_layers
def qserve_quantize(model, quant_config: QuantConfig):
    """Replace supported layers of ``model`` with their QServe equivalents.

    Walks ``model.named_modules_with_parent()`` and, for every layer that is
    an instance of a supported type, builds the matching QServe layer from
    ``get_init_params(layer, quant_cls)`` and installs it in place of the
    original. Layers whose final name component is ``ln_f`` or ``ln_embed``
    are left untouched.

    Args:
        model: Module tree to convert; must provide
            ``named_modules_with_parent()``.
        quant_config: Configuration whose ``quant_mode`` must report
            ``is_qserve_w4a8()``.

    Returns:
        The converted model with its ``quant_mode`` attribute set. This is
        ``model`` itself unless a matched layer had no parent, in which case
        the replacement layer is returned instead.

    Raises:
        AssertionError: If the quant mode is not QServe W4A8.
    """
    quant_mode = quant_config.quant_mode
    assert quant_config.quant_mode.is_qserve_w4a8()

    # (source type, QServe type) pairs, checked in order: the first
    # isinstance match wins, so GatedMLP is tested before MLP.
    replacements = (
        (RmsNorm, QServeRmsNorm),
        (LayerNorm, QServeRmsNorm),
        (GatedMLP, QServeGatedMLP),
        (MLP, QServeMLP),
        (Attention, QServeAttention),
    )
    # Layers with these leaf names are never replaced.
    skipped_names = ('ln_f', 'ln_embed')

    for qualified_name, module, owner in model.named_modules_with_parent():
        leaf_name = qualified_name.rsplit('.', 1)[-1]
        if leaf_name in skipped_names:
            continue

        target_cls = next(
            (dst for src, dst in replacements if isinstance(module, src)),
            None,
        )
        if target_cls is None:
            continue

        ctor_kwargs = get_init_params(module, target_cls)
        ctor_kwargs["quant_mode"] = quant_mode
        if isinstance(module, Attention):
            # NOTE(review): presumably the layer stores a per-rank head count
            # and the constructor expects the total -- confirm.
            total_heads = module.num_attention_heads * module.tp_size
            ctor_kwargs["num_attention_heads"] = total_heads

        replacement = target_cls(**ctor_kwargs)
        if parent_missing := owner is None:
            # The matched layer is the root: the new layer becomes the model.
            model = replacement
        if not parent_missing:
            setattr(owner, leaf_name, replacement)

    model.quant_mode = quant_mode
    return model
537
538
539def fp4_quantize(model, quant_config: QuantConfig):

Callers 1

quantize — Function · 0.85

Calls 3

get_init_params — Function · 0.85
is_qserve_w4a8 — Method · 0.80

Tested by

no test coverage detected