(
dataset: str,
samples: Optional[str] = None,
base_only: bool = False,
parallel: Optional[int] = None,
i_just_wanna_run: bool = False,
test_details: bool = False,
min_time_limit: float = DEFAULT_MIN_TIME_LIMIT,
gt_time_limit_factor: float = DEFAULT_GT_TIME_LIMIT_FACTOR,
mini: bool = False,
noextreme: bool = False,
version: str = "default",
**model_kwargs,
)
| 125 | |
| 126 | |
| 127 | def evaluate( |
| 128 | dataset: str, |
| 129 | samples: Optional[str] = None, |
| 130 | base_only: bool = False, |
| 131 | parallel: Optional[int] = None, |
| 132 | i_just_wanna_run: bool = False, |
| 133 | test_details: bool = False, |
| 134 | min_time_limit: float = DEFAULT_MIN_TIME_LIMIT, |
| 135 | gt_time_limit_factor: float = DEFAULT_GT_TIME_LIMIT_FACTOR, |
| 136 | mini: bool = False, |
| 137 | noextreme: bool = False, |
| 138 | version: str = "default", |
| 139 | **model_kwargs, |
| 140 | ): |
| 141 | if model_kwargs: |
| 142 | # To suppress the warning of tokenizers |
| 143 | os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get( |
| 144 | "TOKENIZERS_PARALLELISM", "false" |
| 145 | ) |
| 146 | samples = run_codegen( |
| 147 | dataset=dataset, |
| 148 | **model_kwargs, |
| 149 | ) |
| 150 | assert samples is not None, "No samples provided" |
| 151 | |
| 152 | n_workers = parallel or max(1, multiprocessing.cpu_count() // 2) |
| 153 | |
| 154 | if os.path.isdir(samples): |
| 155 | result_path = os.path.join(samples, "eval_results.json") |
| 156 | else: |
| 157 | assert samples.endswith(".jsonl") |
| 158 | result_path = samples.replace(".jsonl", "_eval_results.json") |
| 159 | |
| 160 | if os.path.isfile(result_path) and not i_just_wanna_run: |
| 161 | print(f"Load from previous results from {result_path}") |
| 162 | with open(result_path, "r") as f: |
| 163 | results = json.load(f) |
| 164 | |
| 165 | results = compatible_eval_result(results) |
| 166 | else: |
| 167 | if dataset == "humaneval": |
| 168 | problems = get_human_eval_plus( |
| 169 | mini=mini, noextreme=noextreme, version=version |
| 170 | ) |
| 171 | dataset_hash = get_human_eval_plus_hash( |
| 172 | mini=mini, noextreme=noextreme, version=version |
| 173 | ) |
| 174 | expected_output = get_groundtruth(problems, dataset_hash, []) |
| 175 | elif dataset == "mbpp": |
| 176 | problems = get_mbpp_plus(mini=mini, noextreme=noextreme, version=version) |
| 177 | dataset_hash = get_mbpp_plus_hash( |
| 178 | mini=mini, noextreme=noextreme, version=version |
| 179 | ) |
| 180 | expected_output = get_groundtruth( |
| 181 | problems, |
| 182 | dataset_hash, |
| 183 | MBPP_OUTPUT_NOT_NONE_TASKS, |
| 184 | ) |
no test coverage detected