MCPcopy
hub / github.com/z-lab/dflash / _run_server

Function _run_server

dflash/benchmark.py:380–477  ·  view source on GitHub ↗
(args: argparse.Namespace)

Source from the content-addressed store, hash-verified

378
379
380def _run_server(args: argparse.Namespace) -> None:
381 is_vllm = args.backend == "vllm"
382 dataset = load_and_process_dataset(args.dataset)
383 tokenizer = None
384 if not is_vllm:
385 from transformers import AutoTokenizer
386
387 tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
388
389 num_prompts = args.num_prompts + args.concurrency
390 prompts: list[str] = []
391 for i in range(num_prompts):
392 item = dataset[i % len(dataset)]
393 user_content = item["turns"][0]
394 if is_vllm:
395 prompts.append(user_content)
396 else:
397 prompts.append(_apply_chat_template(
398 tokenizer,
399 [{"role": "user", "content": user_content}],
400 args.enable_thinking,
401 ))
402
403 def send_one(prompt: str) -> dict:
404 if is_vllm:
405 return _send_vllm(
406 args.base_url,
407 prompt,
408 model=args.model,
409 max_new_tokens=args.max_new_tokens,
410 temperature=args.temperature,
411 top_p=args.top_p,
412 top_k=args.top_k,
413 timeout_s=args.timeout_s,
414 enable_thinking=args.enable_thinking,
415 )
416 return _send_sglang(
417 args.base_url,
418 prompt,
419 max_new_tokens=args.max_new_tokens,
420 temperature=args.temperature,
421 top_p=args.top_p,
422 top_k=args.top_k,
423 timeout_s=args.timeout_s,
424 )
425
426 if not is_vllm:
427 try:
428 requests.get(args.base_url + "/flush_cache", timeout=60).raise_for_status()
429 except Exception:
430 print("Warning: /flush_cache failed. Continuing.")
431
432 bs = max(args.concurrency, 1)
433 if len(prompts) > bs:
434 print(f"[warmup] {bs} requests ...")
435 with ThreadPoolExecutor(max_workers=bs) as pool:
436 list(pool.map(send_one, prompts[:bs]))
437 prompts = prompts[bs:]

Callers 1

main — Function · 0.85

Calls 2

load_and_process_dataset — Function · 0.85
_apply_chat_template — Function · 0.85

Tested by

no test coverage detected