MCPcopy Index your code
hub / github.com/NVIDIA/TensorRT-LLM / get_quant_method

Function get_quant_method

tensorrt_llm/_torch/modules/linear.py:2039–2066  ·  view source on GitHub ↗
(quant_config: Optional[QuantConfig] = None)

Source from the content-addressed store, hash-verified

2037
2038
def get_quant_method(quant_config: Optional[QuantConfig] = None):
    """Select the linear-method object that matches ``quant_config``.

    The checks run in a fixed order and the first one that matches decides
    the result, so the order of the branches below is significant.

    Args:
        quant_config: Quantization settings for the layer. ``None`` means
            the layer is not quantized.

    Returns:
        A freshly constructed ``*LinearMethod`` instance.
        ``UnquantizedLinearMethod`` is returned when ``quant_config`` is
        ``None`` or when its layer quant mode reports no quantization once
        the KV cache is excluded.

    Raises:
        ValueError: If no branch recognises the layer quant mode.
    """
    if quant_config is None:
        return UnquantizedLinearMethod()

    # Every check below inspects the same per-layer mode; look it up once.
    mode = quant_config.layer_quant_mode

    # KV-cache-only quantization does not affect the linear layer itself.
    if not mode.has_any_quant(exclude_kv_cache=True):
        return UnquantizedLinearMethod()

    # FP8 variants.
    if mode.has_fp8_qdq():
        return FP8QDQLinearMethod()
    if mode.has_fp8_rowwise():
        return FP8RowwiseLinearMethod()
    if mode.has_fp8_block_scales():
        return FP8BlockScalesLinearMethod()

    # FP4 variants.
    if mode.has_nvfp4():
        return NVFP4LinearMethod()
    if mode.has_w4a8_nvfp4_fp8():
        return W4A8NVFP4FP8LinearMethod()
    if mode.has_w4a8_mxfp4_fp8():
        return W4A8MXFP4FP8LinearMethod()

    # Weight-only quantization without per-group scaling.
    if mode.is_weight_only() and not mode.has_per_group_scaling():
        return WeightOnlyQuantLinearMethod()

    # Per-group int4 weight-only: the concrete algorithm picks the method.
    # Any other algorithm falls through to the remaining checks.
    if mode.is_int4_weight_only_per_group():
        algo = quant_config.quant_algo
        if algo == QuantAlgo.W4A16_AWQ:
            return W4A16_AWQ_LinearMethod()
        if algo == QuantAlgo.W4A8_AWQ:
            return W4A8_AWQ_LinearMethod()

    if mode.has_w4a8_mxfp4_mxfp8():
        return W4A8MXFP4MXFP8LinearMethod()

    raise ValueError(f'unsupported quant mode: {quant_config.quant_mode}')
2067
2068
2069class Linear(nn.Module):

Callers 1

get_quant_method · Method · 0.85

Tested by

no test coverage detected