(args: argparse.Namespace)
| 378 | |
| 379 | |
| 380 | def _run_server(args: argparse.Namespace) -> None: |
| 381 | is_vllm = args.backend == "vllm" |
| 382 | dataset = load_and_process_dataset(args.dataset) |
| 383 | tokenizer = None |
| 384 | if not is_vllm: |
| 385 | from transformers import AutoTokenizer |
| 386 | |
| 387 | tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) |
| 388 | |
| 389 | num_prompts = args.num_prompts + args.concurrency |
| 390 | prompts: list[str] = [] |
| 391 | for i in range(num_prompts): |
| 392 | item = dataset[i % len(dataset)] |
| 393 | user_content = item["turns"][0] |
| 394 | if is_vllm: |
| 395 | prompts.append(user_content) |
| 396 | else: |
| 397 | prompts.append(_apply_chat_template( |
| 398 | tokenizer, |
| 399 | [{"role": "user", "content": user_content}], |
| 400 | args.enable_thinking, |
| 401 | )) |
| 402 | |
def send_one(prompt: str) -> dict:
    """Dispatch a single prompt to the configured backend and return its result.

    All request settings are read from the enclosing ``args`` namespace.
    Both backends receive the same sampling/timeout keyword arguments; the
    vLLM helper is additionally given the model name and the
    ``enable_thinking`` flag, which the SGLang helper is not passed here.
    """
    # Keyword arguments common to both backend helpers.
    shared_kwargs = {
        "max_new_tokens": args.max_new_tokens,
        "temperature": args.temperature,
        "top_p": args.top_p,
        "top_k": args.top_k,
        "timeout_s": args.timeout_s,
    }

    if not is_vllm:
        return _send_sglang(args.base_url, prompt, **shared_kwargs)

    return _send_vllm(
        args.base_url,
        prompt,
        model=args.model,
        enable_thinking=args.enable_thinking,
        **shared_kwargs,
    )
| 425 | |
| 426 | if not is_vllm: |
| 427 | try: |
| 428 | requests.get(args.base_url + "/flush_cache", timeout=60).raise_for_status() |
| 429 | except Exception: |
| 430 | print("Warning: /flush_cache failed. Continuing.") |
| 431 | |
| 432 | bs = max(args.concurrency, 1) |
| 433 | if len(prompts) > bs: |
| 434 | print(f"[warmup] {bs} requests ...") |
| 435 | with ThreadPoolExecutor(max_workers=bs) as pool: |
| 436 | list(pool.map(send_one, prompts[:bs])) |
| 437 | prompts = prompts[bs:] |
no test coverage detected