hub / github.com/evalplus/evalplus / test_solution_runtime

Function test_solution_runtime

tools/_experimental/evaluate_runtime.py:70–116 · view source on GitHub ↗

(
    dataset: str = "humaneval",
    task_id: str = "HumanEval/0",
    impl: str = "canonical",
    inputs: Union[str, List[List[Any]]] = "base_input",
)

Source from the content-addressed store, hash-verified

68
69
def test_solution_runtime(
    dataset: str = "humaneval",
    task_id: str = "HumanEval/0",
    impl: str = "canonical",
    inputs: Union[str, List[List[Any]]] = "base_input",
):
    """Time one solution and report its most stable measurement.

    Each input list is executed ``RUN_REPEAT`` times through
    ``execute_for_runtime``. The mean and the sample standard deviation of
    those runs are computed per input, and the pair belonging to the input
    with the smallest standard deviation is kept.

    Args:
        dataset: Dataset name; must contain the substring ``"humaneval"``.
        task_id: ``task_id`` of the problem to time.
        impl: Solution text appended to the problem's ``prompt``, or
            ``"canonical"`` to use the problem's ``canonical_solution``.
        inputs: Input lists to time, or ``"base_input"`` to use the
            problem's own ``base_input`` entries.

    Returns:
        ``[avg_runtime, sd]`` for the input whose runs had the lowest
        standard deviation. The list keeps its initial ``[1000, 1000]``
        value when no input produced a standard deviation below 1000
        (which includes an empty ``inputs``). Returns ``(None, None)`` as
        soon as any run yields something other than a float.

    Raises:
        ValueError: If ``dataset`` does not contain ``"humaneval"``.
        AssertionError: If no problem has the requested ``task_id``.
    """
    if "humaneval" not in dataset:
        # Only the HumanEval+ loader is wired up; fail loudly instead of
        # continuing with no problem selected.
        raise ValueError(f"unsupported {dataset = }")

    # No early exit: if a task_id appeared more than once, the last match
    # is the one used.
    problem = None
    for candidate in get_human_eval_plus():
        if candidate["task_id"] == task_id:
            problem = candidate
    if problem is None:
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        raise AssertionError(f"invalid {task_id = }")

    entry_point = problem["entry_point"]
    base_inputs = problem["base_input"]
    solution_body = problem["canonical_solution"] if impl == "canonical" else impl
    code = problem["prompt"] + solution_body
    if inputs == "base_input":
        inputs = base_inputs

    # Sentinel [avg_runtime, sd]; replaced only by a measurement whose
    # standard deviation is below 1000.
    results = [1000, 1000]
    for input_list in inputs:
        # Warm up on base inputs other than the one being timed, compared by
        # their string form, capped at MAX_WARMUP_LIMIT entries.
        timed_repr = str(input_list)
        warmups = []
        for base_input_list in base_inputs:
            if len(warmups) >= MAX_WARMUP_LIMIT:
                break
            if str(base_input_list) != timed_repr:
                warmups.append(base_input_list)

        runtime_list = [
            execute_for_runtime(code, input_list, warmups, entry_point)
            for _ in range(RUN_REPEAT)
        ]
        # A non-float result is treated as a failed run of the solution.
        if any(not isinstance(runtime, float) for runtime in runtime_list):
            print(f"{task_id = } incorrect")
            return None, None

        avg_runtime = sum(runtime_list) / len(runtime_list)
        # Sample standard deviation (divides by n - 1).
        squared_deviation = sum(
            (runtime - avg_runtime) ** 2 for runtime in runtime_list
        )
        sd = math.sqrt(squared_deviation / (RUN_REPEAT - 1))
        if sd < results[1]:
            results[0] = avg_runtime
            results[1] = sd

    return results

Callers

nothing calls this directly

Calls 2

get_human_eval_plus · Function · 0.90

execute_for_runtime · Function · 0.85

Tested by

no test coverage detected