(
samples: str, dataset: str, nsample_check: int = None, verbose: bool = False
)
| 22 | |
| 23 | |
def script(
    samples: str, dataset: str, nsample_check: int = None, verbose: bool = False
):
    """Load generated solutions and report which benchmark tasks are incomplete.

    Args:
        samples: Passed to ``load_solutions``; every loaded entry is expected
            to be a dict holding a ``"task_id"`` and either a ``"solution"``
            or a ``"completion"``.
        dataset: Benchmark selector, ``"humaneval"`` or ``"mbpp"``.
        nsample_check: Minimum number of samples each task should have. When
            ``None``, only tasks with no sample at all are reported.
        verbose: Not consulted by the completeness check below.

    Raises:
        ValueError: If ``dataset`` is not one of the supported selectors.
        AssertionError: If an entry has neither ``"solution"`` nor
            ``"completion"``.
    """
    # List[Dict{"task_id", "solution"}]
    solutions = load_solutions(samples)

    # From here on ``dataset`` is rebound from the selector string to the
    # loaded benchmark, which is indexed by task id below.
    if dataset == "humaneval":
        from evalplus.data import get_human_eval_plus

        dataset = get_human_eval_plus()
        dataset_name = "HumanEval"
    elif dataset == "mbpp":
        from evalplus.data import get_mbpp_plus

        dataset = get_mbpp_plus()
        dataset_name = "Mbpp"
    else:
        # Without this branch the print below dies with an UnboundLocalError
        # on ``dataset_name``, which hides the real cause from the user.
        raise ValueError(
            f"Unsupported dataset {dataset!r}; expected 'humaneval' or 'mbpp'."
        )

    print(colored(f"Dataset: {dataset_name}", "blue"))

    # Group the samples by the task they answer.
    id2solutions = {}
    for solution in solutions:
        task_id = solution["task_id"]
        bucket = id2solutions.setdefault(task_id, [])
        if "solution" not in solution:
            # Completion-only samples get the task prompt prepended so that
            # every entry carries a full "solution".
            assert "completion" in solution, "solution or completion must exist!"
            solution["solution"] = dataset[task_id]["prompt"] + solution["completion"]
        bucket.append(solution)

    print(colored("==============================", "blue"))
    print(colored(" ::: Checking completeness... ", "blue"))
    print(colored(" ::::: All tasks complete? ", "blue"))
    ndone = 0

    task_ids = dataset.keys()
    ntask = len(task_ids)
    for task_id in dataset:
        if task_id not in id2solutions:
            print(colored(f" ⚠️ {task_id} is missing!", "red"))
            continue
        nfiles = len(id2solutions[task_id])

        # A task is complete once it has at least the expected number of
        # samples (the previous ``<=`` flagged tasks with extra samples and
        # accepted tasks with too few).
        if nsample_check is None or nfiles >= nsample_check:
            ndone += 1
            continue

        print(
            colored(
                f" ⚠️ {task_id} only has {nfiles} samples! But {nsample_check} are expected.",
                "red",
            )
        )

    # Summarise how many tasks fell short, but only when a sample count was requested.
    if nsample_check is not None:
        if ntask != ndone:
            ntbd = ntask - ndone
            print(colored(f" ::::: ⚠️ {ntbd}/{ntask} tasks incomplete!", "red"))
nothing calls this directly
no test coverage detected