Quantize model weights for CPU inference. If no model is specified, interactive mode will be activated.
(
model: Optional[str] = typer.Argument(
None,
help="Model name or path to quantize",
),
method: Optional[QuantMethod] = typer.Option(
None,
"--method",
"-m",
help="Quantization method",
),
output: Optional[Path] = typer.Option(
None,
"--output",
"-o",
help="Output path for quantized weights",
),
input_type: Optional[str] = typer.Option(
None,
"--input-type",
"-i",
help="Input weight type (fp8, fp16, bf16)",
),
cpu_threads: Optional[int] = typer.Option(
None,
"--cpu-threads",
help="Number of CPU threads for quantization",
),
numa_nodes: Optional[int] = typer.Option(
None,
"--numa-nodes",
help="Number of NUMA nodes",
),
no_merge: bool = typer.Option(
False,
"--no-merge",
help="Don't merge safetensor files",
),
gpu: bool = typer.Option(
False,
"--gpu",
help="Use GPU for conversion (faster)",
),
yes: bool = typer.Option(
False,
"--yes",
"-y",
help="Skip confirmation prompts",
),
)
| 35 | |
| 36 | |
| 37 | def quant( |
| 38 | model: Optional[str] = typer.Argument( |
| 39 | None, |
| 40 | help="Model name or path to quantize", |
| 41 | ), |
| 42 | method: Optional[QuantMethod] = typer.Option( |
| 43 | None, |
| 44 | "--method", |
| 45 | "-m", |
| 46 | help="Quantization method", |
| 47 | ), |
| 48 | output: Optional[Path] = typer.Option( |
| 49 | None, |
| 50 | "--output", |
| 51 | "-o", |
| 52 | help="Output path for quantized weights", |
| 53 | ), |
| 54 | input_type: Optional[str] = typer.Option( |
| 55 | None, |
| 56 | "--input-type", |
| 57 | "-i", |
| 58 | help="Input weight type (fp8, fp16, bf16)", |
| 59 | ), |
| 60 | cpu_threads: Optional[int] = typer.Option( |
| 61 | None, |
| 62 | "--cpu-threads", |
| 63 | help="Number of CPU threads for quantization", |
| 64 | ), |
| 65 | numa_nodes: Optional[int] = typer.Option( |
| 66 | None, |
| 67 | "--numa-nodes", |
| 68 | help="Number of NUMA nodes", |
| 69 | ), |
| 70 | no_merge: bool = typer.Option( |
| 71 | False, |
| 72 | "--no-merge", |
| 73 | help="Don't merge safetensor files", |
| 74 | ), |
| 75 | gpu: bool = typer.Option( |
| 76 | False, |
| 77 | "--gpu", |
| 78 | help="Use GPU for conversion (faster)", |
| 79 | ), |
| 80 | yes: bool = typer.Option( |
| 81 | False, |
| 82 | "--yes", |
| 83 | "-y", |
| 84 | help="Skip confirmation prompts", |
| 85 | ), |
| 86 | ) -> None: |
| 87 | """Quantize model weights for CPU inference. |
| 88 | |
| 89 | If no model is specified, interactive mode will be activated. |
| 90 | """ |
| 91 | settings = get_settings() |
| 92 | |
| 93 | # Check if we should use interactive mode |
| 94 | # Interactive mode triggers when: no model, or missing critical parameters |
Nothing calls this function directly.
No test coverage was detected.