(
dataset: str = "humaneval",
task_id: str = "HumanEval/0",
impl: str = "canonical",
inputs: Union[str, List[List[Any]]] = "base_input",
)
| 68 | |
| 69 | |
def test_solution_runtime(
    dataset: str = "humaneval",
    task_id: str = "HumanEval/0",
    impl: str = "canonical",
    inputs: Union[str, List[List[Any]]] = "base_input",
):
    """Time one solution of a benchmark task and report its steadiest measurement.

    The task is looked up by ``task_id`` in ``get_human_eval_plus()``. Every
    input in ``inputs`` is executed ``RUN_REPEAT`` times through
    ``execute_for_runtime``; the mean and sample standard deviation of those
    runs are computed, and the pair belonging to the input with the smallest
    standard deviation is kept.

    Args:
        dataset: Dataset name; it must contain the substring ``"humaneval"``.
        task_id: Value matched against each problem's ``"task_id"`` field.
        impl: Code appended to the problem's ``"prompt"``. The literal
            ``"canonical"`` selects the problem's ``"canonical_solution"``.
        inputs: Input lists to time, or the literal ``"base_input"`` to use
            the problem's own ``"base_input"`` list.

    Returns:
        ``[avg_runtime, sd]`` (a list) for the input whose repeated runs had
        the smallest standard deviation. The initial ``[1000, 1000]`` is
        returned unchanged when no input produces a standard deviation below
        1000. ``(None, None)`` (a tuple) is returned as soon as any run
        yields a non-float result.

    Raises:
        ValueError: If ``dataset`` does not contain ``"humaneval"``.
        AssertionError: If no problem has the requested ``task_id``.
    """
    if "humaneval" not in dataset:
        # Without this guard `problem` and `entry_point` would stay unbound
        # and the function would fail later with an UnboundLocalError.
        raise ValueError(f"unsupported dataset: {dataset!r}")

    problem = None
    for candidate in get_human_eval_plus():
        if candidate["task_id"] == task_id:
            problem = candidate
    if problem is None:
        # Raised explicitly (rather than via `assert`) so the check survives
        # `python -O`; the exception type and message are unchanged.
        raise AssertionError(f"invalid {task_id = }")

    entry_point = problem["entry_point"]
    if impl == "canonical":
        impl = problem["canonical_solution"]
    impl = problem["prompt"] + impl
    if inputs == "base_input":
        inputs = problem["base_input"]

    # [mean, standard deviation]; 1000 acts as the "nothing selected" sentinel.
    results = [1000, 1000]
    for input_list in inputs:
        # Choose warm-up inputs: base inputs whose string form differs from
        # the input being timed, capped at MAX_WARMUP_LIMIT entries.
        input_key = str(input_list)
        warmups = []
        for base_input_list in problem["base_input"]:
            if len(warmups) >= MAX_WARMUP_LIMIT:
                break
            if str(base_input_list) != input_key:
                warmups.append(base_input_list)

        runtime_list = [
            execute_for_runtime(impl, input_list, warmups, entry_point)
            for _ in range(RUN_REPEAT)
        ]
        # Any non-float result is treated as a failed run of the solution.
        if not all(isinstance(x, float) for x in runtime_list):
            print(f"{task_id = } incorrect")
            return None, None

        sample_count = len(runtime_list)
        if sample_count == 0:
            # Nothing was measured for this input (RUN_REPEAT == 0).
            continue

        avg_runtime = sum(runtime_list) / sample_count
        if sample_count > 1:
            sd = math.sqrt(
                sum((runtime - avg_runtime) ** 2 for runtime in runtime_list)
                / (sample_count - 1)
            )
        else:
            # The sample standard deviation is undefined for a single run;
            # fall back to 0.0 instead of dividing by zero.
            sd = 0.0

        # Keep the measurement with the smallest spread seen so far.
        if sd < results[1]:
            results[0] = avg_runtime
            results[1] = sd

    return results
nothing calls this directly
no test coverage detected