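Serves a vLLM model from the command line. Args: model_path: The path to the vLLM model. It can be either a local path or a Hugging Face model ID. port: The port to serve the model on. startup_timeout: The timeout for the server to start. terminate_timeout: The timeout for the server to terminate. max_model_len: The maximum model length. gpu_memory_utilization: The GPU memory utilization for the server. Set it lower to avoid OOM. quantization: The quantization method. auto_tool_choice: Whether to enable auto tool choice. tool_call_parser: The tool call parser to use.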
(
model_path: str,
port: int,
startup_timeout: float = 300.0,
terminate_timeout: float = 10.0,
max_model_len: int = 32768,
gpu_memory_utilization: float = 0.7,
quantization: Optional[str] = "bitsandbytes",
auto_tool_choice: bool = True,
tool_call_parser: Optional[str] = "hermes",
)
| 59 | |
| 60 | @contextmanager |
| 61 | def vllm_server( |
| 62 | model_path: str, |
| 63 | port: int, |
| 64 | startup_timeout: float = 300.0, |
| 65 | terminate_timeout: float = 10.0, |
| 66 | max_model_len: int = 32768, |
| 67 | gpu_memory_utilization: float = 0.7, |
| 68 | quantization: Optional[str] = "bitsandbytes", |
| 69 | auto_tool_choice: bool = True, |
| 70 | tool_call_parser: Optional[str] = "hermes", |
| 71 | ): |
| 72 | """Serves a vLLM model from command line. |
| 73 | |
| 74 | Args: |
| 75 | model_path: The path to the vLLM model. It can be either a local path or a Hugging Face model ID. |
| 76 | port: The port to serve the model on. |
| 77 | startup_timeout: The timeout for the server to start. |
| 78 | terminate_timeout: The timeout for the server to terminate. |
| 79 | max_model_len: The maximum model length. |
| 80 | gpu_memory_utilization: The GPU memory utilization for the server. Set it lower to avoid OOM. |
| 81 | quantization: The quantization method. |
| 82 | auto_tool_choice: Whether to enable auto tool choice. |
| 83 | tool_call_parser: The tool call parser to use. |
| 84 | """ |
| 85 | proc: Optional[subprocess.Popen[bytes]] = None |
| 86 | try: |
| 87 | vllm_serve_args = [ |
| 88 | "--gpu-memory-utilization", |
| 89 | str(gpu_memory_utilization), |
| 90 | "--max-model-len", |
| 91 | str(max_model_len), |
| 92 | "--port", |
| 93 | str(port), |
| 94 | ] |
| 95 | if quantization is not None: |
| 96 | vllm_serve_args.append("--quantization") |
| 97 | vllm_serve_args.append(quantization) |
| 98 | if auto_tool_choice: |
| 99 | vllm_serve_args.append("--enable-auto-tool-choice") |
| 100 | if tool_call_parser is not None: |
| 101 | vllm_serve_args.append("--tool-call-parser") |
| 102 | vllm_serve_args.append(tool_call_parser) |
| 103 | |
| 104 | proc = subprocess.Popen(["vllm", "serve", model_path, *vllm_serve_args]) |
| 105 | |
| 106 | # Wait for the server to be ready |
| 107 | url = f"http://localhost:{port}/health" |
| 108 | start = time.time() |
| 109 | client = httpx.Client() |
| 110 | |
| 111 | while True: |
| 112 | try: |
| 113 | if client.get(url).status_code == 200: |
| 114 | break |
| 115 | except Exception: |
| 116 | result = proc.poll() |
| 117 | if result is not None and result != 0: |
| 118 | raise RuntimeError("Server exited unexpectedly.") from None |
no test coverage detected