(data_name, prompt_type, samples: list=None, file_path: str=None, max_num_samples=None, execute=False)
| 12 | |
| 13 | |
| 14 | def evaluate(data_name, prompt_type, samples: list=None, file_path: str=None, max_num_samples=None, execute=False): |
| 15 | assert samples or file_path, "samples or file_path must be provided" |
| 16 | if not samples: |
| 17 | samples = list(load_jsonl(file_path)) |
| 18 | if 'idx' in samples[0]: |
| 19 | samples = {sample['idx']: sample for sample in samples}.values() |
| 20 | samples = sorted(samples, key=lambda x: x['idx']) |
| 21 | else: |
| 22 | samples = [dict(idx=idx, **sample) for idx, sample in enumerate(samples)] |
| 23 | |
| 24 | if max_num_samples: |
| 25 | print(f"max_num_samples: {max_num_samples} / {len(samples)}") |
| 26 | samples = samples[:max_num_samples] |
| 27 | |
| 28 | # parse gt |
| 29 | for sample in samples: |
| 30 | sample['gt_cot'], sample['gt'] = parse_ground_truth(sample, data_name) |
| 31 | params = [(idx, pred, sample['gt']) for idx, sample in enumerate(samples) for pred in sample['pred']] |
| 32 | |
| 33 | scores = [] |
| 34 | timeout_cnt = 0 |
| 35 | |
| 36 | with ProcessPool(max_workers=1) as pool: |
| 37 | future = pool.map(math_equal_process, params, timeout=3) |
| 38 | iterator = future.result() |
| 39 | with tqdm(total=len(samples), desc="Evaluate") as progress_bar: |
| 40 | while True: |
| 41 | try: |
| 42 | result = next(iterator) |
| 43 | scores.append(result) |
| 44 | except StopIteration: |
| 45 | break |
| 46 | except TimeoutError as error: |
| 47 | print(error) |
| 48 | scores.append(False) |
| 49 | timeout_cnt += 1 |
| 50 | except Exception as error: |
| 51 | print(error.traceback) |
| 52 | exit() |
| 53 | progress_bar.update(1) |
| 54 | |
| 55 | idx = 0 |
| 56 | score_mat = [] |
| 57 | for sample in samples: |
| 58 | sample['score'] = scores[idx: idx+len(sample['pred'])] |
| 59 | assert len(sample['score']) == len(sample['pred']) |
| 60 | score_mat.append(sample['score']) |
| 61 | idx += len(sample['pred']) |
| 62 | |
| 63 | max_len = max([len(s) for s in score_mat]) |
| 64 | |
| 65 | for i, s in enumerate(score_mat): |
| 66 | if len(s) < max_len: |
| 67 | score_mat[i] = s + [s[-1]] * (max_len - len(s)) # pad |
| 68 | |
| 69 | # output mean of each column of scores |
| 70 | col_means= np.array(score_mat).mean(axis=0) |
| 71 | mean_score = list(np.round(col_means * 100, decimals=1)) |
no test coverage detected