(quant_config: Optional[QuantConfig] = None)
| 2037 | |
| 2038 | |
def get_quant_method(quant_config: Optional[QuantConfig] = None):
    """Return the linear-method object that matches ``quant_config``.

    The checks run in a fixed order and the first one that matches wins,
    so the ordering below is part of the contract.

    Args:
        quant_config: Quantization settings for the layer. ``None`` is
            treated the same as "no quantization".

    Returns:
        A freshly constructed ``*LinearMethod`` instance.

    Raises:
        ValueError: If quantization is enabled but none of the known
            modes matches.
    """
    # No config at all: use the plain (unquantized) path.
    if quant_config is None:
        return UnquantizedLinearMethod()

    mode = quant_config.layer_quant_mode

    # The check is made with exclude_kv_cache=True; when it reports no
    # quantization, the unquantized path is used as well.
    if not mode.has_any_quant(exclude_kv_cache=True):
        return UnquantizedLinearMethod()

    # FP8 variants.
    if mode.has_fp8_qdq():
        return FP8QDQLinearMethod()
    if mode.has_fp8_rowwise():
        return FP8RowwiseLinearMethod()
    if mode.has_fp8_block_scales():
        return FP8BlockScalesLinearMethod()

    # FP4-weight variants that are tested before the weight-only checks.
    if mode.has_nvfp4():
        return NVFP4LinearMethod()
    if mode.has_w4a8_nvfp4_fp8():
        return W4A8NVFP4FP8LinearMethod()
    if mode.has_w4a8_mxfp4_fp8():
        return W4A8MXFP4FP8LinearMethod()

    # Weight-only quantization without per-group scaling.
    if mode.is_weight_only() and not mode.has_per_group_scaling():
        return WeightOnlyQuantLinearMethod()

    # Per-group INT4 weight-only: the concrete method depends on the
    # configured algorithm. Any other algorithm falls through to the
    # remaining checks below.
    if mode.is_int4_weight_only_per_group():
        algo = quant_config.quant_algo
        if algo == QuantAlgo.W4A16_AWQ:
            return W4A16_AWQ_LinearMethod()
        if algo == QuantAlgo.W4A8_AWQ:
            return W4A8_AWQ_LinearMethod()

    if mode.has_w4a8_mxfp4_mxfp8():
        return W4A8MXFP4MXFP8LinearMethod()

    raise ValueError(f'unsupported quant mode: {quant_config.quant_mode}')
| 2067 | |
| 2068 | |
| 2069 | class Linear(nn.Module): |
no test coverage detected