Actual implementation of the `run` command.
(
model: Optional[str],
host: Optional[str],
port: Optional[int],
gpu_experts: Optional[int],
cpu_threads: Optional[int],
numa_nodes: Optional[tuple[int, ...]],
tensor_parallel_size: Optional[int],
model_path: Optional[Path],
weights_path: Optional[Path],
kt_method: Optional[str],
kt_gpu_prefill_threshold: Optional[int],
attention_backend: Optional[str],
max_total_tokens: Optional[int],
max_running_requests: Optional[int],
chunked_prefill_size: Optional[int],
mem_fraction_static: Optional[float],
watchdog_timeout: Optional[int],
served_model_name: Optional[str],
disable_shared_experts_fusion: Optional[bool],
quantize: bool,
advanced: bool,
dry_run: bool,
extra_cli_args: list[str],
)
| 168 | |
| 169 | |
| 170 | def _run_impl( |
| 171 | model: Optional[str], |
| 172 | host: Optional[str], |
| 173 | port: Optional[int], |
| 174 | gpu_experts: Optional[int], |
| 175 | cpu_threads: Optional[int], |
| 176 | numa_nodes: Optional[tuple[int, ...]], |
| 177 | tensor_parallel_size: Optional[int], |
| 178 | model_path: Optional[Path], |
| 179 | weights_path: Optional[Path], |
| 180 | kt_method: Optional[str], |
| 181 | kt_gpu_prefill_threshold: Optional[int], |
| 182 | attention_backend: Optional[str], |
| 183 | max_total_tokens: Optional[int], |
| 184 | max_running_requests: Optional[int], |
| 185 | chunked_prefill_size: Optional[int], |
| 186 | mem_fraction_static: Optional[float], |
| 187 | watchdog_timeout: Optional[int], |
| 188 | served_model_name: Optional[str], |
| 189 | disable_shared_experts_fusion: Optional[bool], |
| 190 | quantize: bool, |
| 191 | advanced: bool, |
| 192 | dry_run: bool, |
| 193 | extra_cli_args: list[str], |
| 194 | ) -> None: |
| 195 | """Actual implementation of run command.""" |
| 196 | # Check if SGLang is installed before proceeding |
| 197 | from kt_kernel.cli.utils.sglang_checker import ( |
| 198 | check_sglang_installation, |
| 199 | check_sglang_kt_kernel_support, |
| 200 | print_sglang_install_instructions, |
| 201 | print_sglang_kt_kernel_instructions, |
| 202 | ) |
| 203 | |
| 204 | sglang_info = check_sglang_installation() |
| 205 | if not sglang_info["installed"]: |
| 206 | console.print() |
| 207 | print_error(t("sglang_not_found")) |
| 208 | console.print() |
| 209 | print_sglang_install_instructions() |
| 210 | raise typer.Exit(1) |
| 211 | |
| 212 | # Check if SGLang supports kt-kernel (has --kt-gpu-prefill-token-threshold parameter) |
| 213 | kt_kernel_support = check_sglang_kt_kernel_support() |
| 214 | if not kt_kernel_support["supported"]: |
| 215 | console.print() |
| 216 | print_error(t("sglang_kt_kernel_not_supported")) |
| 217 | console.print() |
| 218 | print_sglang_kt_kernel_instructions() |
| 219 | raise typer.Exit(1) |
| 220 | |
| 221 | settings = get_settings() |
| 222 | user_registry = UserModelRegistry() |
| 223 | |
| 224 | # Check if we should use interactive mode |
| 225 | # Interactive mode triggers when: |
| 226 | # 1. No model specified, OR |
| 227 | # 2. Model specified but missing critical parameters (gpu_experts, tensor_parallel_size, etc.) |
no test coverage detected