(
input_path: str = None,
output_path: str = None,
log_path: str = None,
tmp_dir: str = "./",
n_workers: int = 32,
timeout: float = 5.0,
k: List[int] = [1, 10, 100],
model_name: str = None,
problem_file: str = None,
language_type: str = None,
dataset_type: str = "humanevalx",
generation_mode: str = "completion",
test_groundtruth: bool = False,
)
| 92 | |
| 93 | |
| 94 | def evaluate_functional_correctness( |
| 95 | input_path: str = None, |
| 96 | output_path: str = None, |
| 97 | log_path: str = None, |
| 98 | tmp_dir: str = "./", |
| 99 | n_workers: int = 32, |
| 100 | timeout: float = 5.0, |
| 101 | k: List[int] = [1, 10, 100], |
| 102 | model_name: str = None, |
| 103 | problem_file: str = None, |
| 104 | language_type: str = None, |
| 105 | dataset_type: str = "humanevalx", |
| 106 | generation_mode: str = "completion", |
| 107 | test_groundtruth: bool = False, |
| 108 | ): |
| 109 | if log_path is None: |
| 110 | log_path = os.path.join(output_path, "evaluation.log") |
| 111 | logger = Logger(__name__, log_file=log_path) |
| 112 | |
| 113 | if os.path.isdir(input_path): |
| 114 | input_list = glob.glob(input_path + '/*generation*.jsonl') |
| 115 | sample_jsonl = [] |
| 116 | for input_file in input_list: |
| 117 | sample_jsonl += stream_jsonl_all(input_file) |
| 118 | else: |
| 119 | input_file = input_path |
| 120 | sample_jsonl = stream_jsonl_all(input_file) |
| 121 | |
| 122 | problems = read_dataset(problem_file, dataset_type=dataset_type) |
| 123 | |
| 124 | if output_path is not None: |
| 125 | os.makedirs(output_path, exist_ok=True) |
| 126 | |
| 127 | with ThreadPoolExecutor(max_workers=n_workers) as executor: |
| 128 | |
| 129 | futures = [] |
| 130 | completion_id = Counter() |
| 131 | n_samples = 0 |
| 132 | results = defaultdict(list) |
| 133 | |
| 134 | if test_groundtruth: |
| 135 | logger.info("Testing ground truth...") |
| 136 | else: |
| 137 | logger.info("Testing generation...") |
| 138 | for sample in sample_jsonl: |
| 139 | task_id = sample["task_id"] |
| 140 | if language_type is None: |
| 141 | language_type = LANGUAGE_NAME[task_id.split("/")[0]] |
| 142 | if test_groundtruth: |
| 143 | if dataset_type == "humanevalx": |
| 144 | sample["generation"] = sample["canonical_solution"] |
| 145 | sample["prompt"] = problems[task_id]["prompt"] |
| 146 | if dataset_type == "mbpp": |
| 147 | sample["generation"] = sample["code"] |
| 148 | sample["prompt"] = problems[task_id]["prompt"] |
| 149 | sample = postprocess_generation(sample, generation_mode) |
| 150 | sample["test_code"] = process_test(sample, problems, dataset_type, language_type, generation_mode) |
| 151 | if sample["test_code"] is None: |
nothing calls this directly
no test coverage detected