MCPcopy
hub / github.com/microsoft/agent-lightning / vllm_server

Function vllm_server

examples/unsloth/sft_algorithm.py:61–132  ·  view source on GitHub ↗

Serves a vLLM model from command line. Args: model_path: The path to the vLLM model. It can be either a local path or a Hugging Face model ID. port: The port to serve the model on. startup_timeout: The timeout for the server to start. terminate_timeout: The timeout for the server to terminate. …

(
    model_path: str,
    port: int,
    startup_timeout: float = 300.0,
    terminate_timeout: float = 10.0,
    max_model_len: int = 32768,
    gpu_memory_utilization: float = 0.7,
    quantization: Optional[str] = "bitsandbytes",
    auto_tool_choice: bool = True,
    tool_call_parser: Optional[str] = "hermes",
)

Source from the content-addressed store, hash-verified

59
60@contextmanager
61def vllm_server(
62 model_path: str,
63 port: int,
64 startup_timeout: float = 300.0,
65 terminate_timeout: float = 10.0,
66 max_model_len: int = 32768,
67 gpu_memory_utilization: float = 0.7,
68 quantization: Optional[str] = "bitsandbytes",
69 auto_tool_choice: bool = True,
70 tool_call_parser: Optional[str] = "hermes",
71):
72 """Serves a vLLM model from command line.
73
74 Args:
75 model_path: The path to the vLLM model. It can be either a local path or a Hugging Face model ID.
76 port: The port to serve the model on.
77 startup_timeout: The timeout for the server to start.
78 terminate_timeout: The timeout for the server to terminate.
79 max_model_len: The maximum model length.
80 gpu_memory_utilization: The GPU memory utilization for the server. Set it lower to avoid OOM.
81 quantization: The quantization method.
82 auto_tool_choice: Whether to enable auto tool choice.
83 tool_call_parser: The tool call parser to use.
84 """
85 proc: Optional[subprocess.Popen[bytes]] = None
86 try:
87 vllm_serve_args = [
88 "--gpu-memory-utilization",
89 str(gpu_memory_utilization),
90 "--max-model-len",
91 str(max_model_len),
92 "--port",
93 str(port),
94 ]
95 if quantization is not None:
96 vllm_serve_args.append("--quantization")
97 vllm_serve_args.append(quantization)
98 if auto_tool_choice:
99 vllm_serve_args.append("--enable-auto-tool-choice")
100 if tool_call_parser is not None:
101 vllm_serve_args.append("--tool-call-parser")
102 vllm_serve_args.append(tool_call_parser)
103
104 proc = subprocess.Popen(["vllm", "serve", model_path, *vllm_serve_args])
105
106 # Wait for the server to be ready
107 url = f"http://localhost:{port}/health"
108 start = time.time()
109 client = httpx.Client()
110
111 while True:
112 try:
113 if client.get(url).status_code == 200:
114 break
115 except Exception:
116 result = proc.poll()
117 if result is not None and result != 0:
118 raise RuntimeError("Server exited unexpectedly.") from None

Callers 1

sft_one_iter · Function · 0.70

Calls 3

time · Method · 0.80
get · Method · 0.45
wait · Method · 0.45

Tested by

no test coverage detected