MCPcopy Index your code
hub / github.com/evalplus/evalplus / test_solution_runtime

Function test_solution_runtime

tools/_experimental/evaluate_runtime.py:70–116  ·  view source on GitHub ↗
(
    dataset: str = "humaneval",
    task_id: str = "HumanEval/0",
    impl: str = "canonical",
    inputs: Union[str, List[List[Any]]] = "base_input",
)

Source from the content-addressed store, hash-verified

68
69
def test_solution_runtime(
    dataset: str = "humaneval",
    task_id: str = "HumanEval/0",
    impl: str = "canonical",
    inputs: Union[str, List[List[Any]]] = "base_input",
):
    """Time one solution on one task and report its most stable measurement.

    Each input is executed ``RUN_REPEAT`` times via ``execute_for_runtime``.
    The mean and sample standard deviation of those runs are computed per
    input, and the pair belonging to the input with the smallest standard
    deviation is returned.

    Args:
        dataset: Dataset name; only names containing ``"humaneval"`` are
            handled.
        task_id: Value matched against each problem's ``"task_id"`` field.
        impl: Solution body appended to the problem's prompt, or
            ``"canonical"`` to use the problem's ``"canonical_solution"``.
        inputs: Argument lists to time, or ``"base_input"`` to use the
            problem's own ``"base_input"`` list.

    Returns:
        ``[avg_runtime, sd]`` for the input with the lowest standard
        deviation. The list stays at its initial ``[1000, 1000]`` when no
        input yields a standard deviation below 1000 (including when
        ``inputs`` is empty). ``(None, None)`` is returned as soon as any
        run produces a non-float result. ``None`` is returned when
        ``dataset`` does not contain ``"humaneval"``.

    Raises:
        ValueError: If no problem in the dataset has the given ``task_id``.
    """
    if "humaneval" not in dataset:
        # NOTE(review): the original appears to nest the whole body under the
        # dataset check, so other datasets fell through and returned None.
        # Kept as-is for compatibility -- confirm callers expect this.
        return None

    # Stop at the first match; task ids are presumably unique per dataset.
    problem = next(
        (p for p in get_human_eval_plus() if p["task_id"] == task_id), None
    )
    if problem is None:
        # Raised explicitly: an assert would be stripped under ``python -O``.
        raise ValueError(f"invalid {task_id = }")

    entry_point = problem["entry_point"]
    solution_body = problem["canonical_solution"] if impl == "canonical" else impl
    impl = problem["prompt"] + solution_body
    if inputs == "base_input":
        inputs = problem["base_input"]

    # [avg_runtime, sd]; 1000 is the original "nothing measured yet" sentinel.
    results = [1000, 1000]
    for input_list in inputs:
        # Choose warm-up inputs: base inputs whose textual form differs from
        # the input being timed, capped at MAX_WARMUP_LIMIT entries.
        timed_repr = str(input_list)
        warmups = []
        for base_input_list in problem["base_input"]:
            if len(warmups) >= MAX_WARMUP_LIMIT:
                break
            if str(base_input_list) != timed_repr:
                warmups.append(base_input_list)

        runtime_list = [
            execute_for_runtime(impl, input_list, warmups, entry_point)
            for _ in range(RUN_REPEAT)
        ]
        # Any non-float result marks the whole task as incorrect.
        if not all(isinstance(x, float) for x in runtime_list):
            print(f"{task_id = } incorrect")
            return None, None

        run_count = len(runtime_list)
        avg_runtime = sum(runtime_list) / run_count
        if run_count > 1:
            # Sample standard deviation (n - 1 in the denominator).
            sd = math.sqrt(
                sum((runtime - avg_runtime) ** 2 for runtime in runtime_list)
                / (run_count - 1)
            )
        else:
            # A single run has no spread to estimate; avoid dividing by zero.
            sd = 0.0

        # Keep the measurement with the least run-to-run variation.
        if sd < results[1]:
            results[0] = avg_runtime
            results[1] = sd

    return results

Callers

nothing calls this directly

Calls 2

get_human_eval_plus — Function · 0.90
execute_for_runtime — Function · 0.85

Tested by

no test coverage detected