hub / github.com/kvcache-ai/ktransformers / quant

Function quant

kt-kernel/python/cli/commands/quant.py:37–475 · view source on GitHub ↗

Quantize model weights for CPU inference. If no model is specified, interactive mode will be activated.

(
    model: Optional[str] = typer.Argument(
        None,
        help="Model name or path to quantize",
    ),
    method: Optional[QuantMethod] = typer.Option(
        None,
        "--method",
        "-m",
        help="Quantization method",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output",
        "-o",
        help="Output path for quantized weights",
    ),
    input_type: Optional[str] = typer.Option(
        None,
        "--input-type",
        "-i",
        help="Input weight type (fp8, fp16, bf16)",
    ),
    cpu_threads: Optional[int] = typer.Option(
        None,
        "--cpu-threads",
        help="Number of CPU threads for quantization",
    ),
    numa_nodes: Optional[int] = typer.Option(
        None,
        "--numa-nodes",
        help="Number of NUMA nodes",
    ),
    no_merge: bool = typer.Option(
        False,
        "--no-merge",
        help="Don't merge safetensor files",
    ),
    gpu: bool = typer.Option(
        False,
        "--gpu",
        help="Use GPU for conversion (faster)",
    ),
    yes: bool = typer.Option(
        False,
        "--yes",
        "-y",
        help="Skip confirmation prompts",
    ),
)

Source from the content-addressed store, hash-verified

35
36
37	def quant(
38	model: Optional[str] = typer.Argument(
39	None,
40	help="Model name or path to quantize",
41	),
42	method: Optional[QuantMethod] = typer.Option(
43	None,
44	"--method",
45	"-m",
46	help="Quantization method",
47	),
48	output: Optional[Path] = typer.Option(
49	None,
50	"--output",
51	"-o",
52	help="Output path for quantized weights",
53	),
54	input_type: Optional[str] = typer.Option(
55	None,
56	"--input-type",
57	"-i",
58	help="Input weight type (fp8, fp16, bf16)",
59	),
60	cpu_threads: Optional[int] = typer.Option(
61	None,
62	"--cpu-threads",
63	help="Number of CPU threads for quantization",
64	),
65	numa_nodes: Optional[int] = typer.Option(
66	None,
67	"--numa-nodes",
68	help="Number of NUMA nodes",
69	),
70	no_merge: bool = typer.Option(
71	False,
72	"--no-merge",
73	help="Don't merge safetensor files",
74	),
75	gpu: bool = typer.Option(
76	False,
77	"--gpu",
78	help="Use GPU for conversion (faster)",
79	),
80	yes: bool = typer.Option(
81	False,
82	"--yes",
83	"-y",
84	help="Skip confirmation prompts",
85	),
86	) -> None:
87	"""Quantize model weights for CPU inference.
88
89	If no model is specified, interactive mode will be activated.
90	"""
91	settings = get_settings()
92
93	# Check if we should use interactive mode
94	# Interactive mode triggers when: no model, or missing critical parameters

Callers

nothing calls this directly

Calls 15

find_by_path · Method · 0.95

suggest_name · Method · 0.95

add_model · Method · 0.95

get_settings · Function · 0.90

t · Function · 0.90

interactive_quant_config · Function · 0.90

print_success · Function · 0.90

print_error · Function · 0.90

UserModelRegistry · Class · 0.90

pre_operation_verification · Function · 0.90

is_amx_weights · Function · 0.90

analyze_moe_model · Function · 0.90

Tested by

no test coverage detected