hub / github.com/z-lab/dflash / _run_server

Function _run_server

dflash/benchmark.py:380–477 · view source on GitHub ↗

(args: argparse.Namespace)

Source from the content-addressed store, hash-verified

378
379
380	def _run_server(args: argparse.Namespace) -> None:
381	is_vllm = args.backend == "vllm"
382	dataset = load_and_process_dataset(args.dataset)
383	tokenizer = None
384	if not is_vllm:
385	from transformers import AutoTokenizer
386
387	tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
388
389	num_prompts = args.num_prompts + args.concurrency
390	prompts: list[str] = []
391	for i in range(num_prompts):
392	item = dataset[i % len(dataset)]
393	user_content = item["turns"][0]
394	if is_vllm:
395	prompts.append(user_content)
396	else:
397	prompts.append(_apply_chat_template(
398	tokenizer,
399	[{"role": "user", "content": user_content}],
400	args.enable_thinking,
401	))
402
403	def send_one(prompt: str) -> dict:
404	if is_vllm:
405	return _send_vllm(
406	args.base_url,
407	prompt,
408	model=args.model,
409	max_new_tokens=args.max_new_tokens,
410	temperature=args.temperature,
411	top_p=args.top_p,
412	top_k=args.top_k,
413	timeout_s=args.timeout_s,
414	enable_thinking=args.enable_thinking,
415	)
416	return _send_sglang(
417	args.base_url,
418	prompt,
419	max_new_tokens=args.max_new_tokens,
420	temperature=args.temperature,
421	top_p=args.top_p,
422	top_k=args.top_k,
423	timeout_s=args.timeout_s,
424	)
425
426	if not is_vllm:
427	try:
428	requests.get(args.base_url + "/flush_cache", timeout=60).raise_for_status()
429	except Exception:
430	print("Warning: /flush_cache failed. Continuing.")
431
432	bs = max(args.concurrency, 1)
433	if len(prompts) > bs:
434	print(f"[warmup] {bs} requests ...")
435	with ThreadPoolExecutor(max_workers=bs) as pool:
436	list(pool.map(send_one, prompts[:bs]))
437	prompts = prompts[bs:]

Callers 1

main · Function · 0.85

Calls 2

load_and_process_dataset · Function · 0.85

_apply_chat_template · Function · 0.85

Tested by

no test coverage detected