MCPcopy Index your code
hub / github.com/PRIME-RL/PRIME / evaluate

Function evaluate

eval/Math/Qwen25-Math/evaluation/evaluate.py:14–93  ·  view source on GitHub ↗
(data_name, prompt_type, samples: list=None, file_path: str=None, max_num_samples=None, execute=False)

Source from the content-addressed store, hash-verified

12
13
14def evaluate(data_name, prompt_type, samples: list=None, file_path: str=None, max_num_samples=None, execute=False):
15 assert samples or file_path, "samples or file_path must be provided"
16 if not samples:
17 samples = list(load_jsonl(file_path))
18 if 'idx' in samples[0]:
19 samples = {sample['idx']: sample for sample in samples}.values()
20 samples = sorted(samples, key=lambda x: x['idx'])
21 else:
22 samples = [dict(idx=idx, **sample) for idx, sample in enumerate(samples)]
23
24 if max_num_samples:
25 print(f"max_num_samples: {max_num_samples} / {len(samples)}")
26 samples = samples[:max_num_samples]
27
28 # parse gt
29 for sample in samples:
30 sample['gt_cot'], sample['gt'] = parse_ground_truth(sample, data_name)
31 params = [(idx, pred, sample['gt']) for idx, sample in enumerate(samples) for pred in sample['pred']]
32
33 scores = []
34 timeout_cnt = 0
35
36 with ProcessPool(max_workers=1) as pool:
37 future = pool.map(math_equal_process, params, timeout=3)
38 iterator = future.result()
39 with tqdm(total=len(samples), desc="Evaluate") as progress_bar:
40 while True:
41 try:
42 result = next(iterator)
43 scores.append(result)
44 except StopIteration:
45 break
46 except TimeoutError as error:
47 print(error)
48 scores.append(False)
49 timeout_cnt += 1
50 except Exception as error:
51 print(error.traceback)
52 exit()
53 progress_bar.update(1)
54
55 idx = 0
56 score_mat = []
57 for sample in samples:
58 sample['score'] = scores[idx: idx+len(sample['pred'])]
59 assert len(sample['score']) == len(sample['pred'])
60 score_mat.append(sample['score'])
61 idx += len(sample['pred'])
62
63 max_len = max([len(s) for s in score_mat])
64
65 for i, s in enumerate(score_mat):
66 if len(s) < max_len:
67 score_mat[i] = s + [s[-1]] * (max_len - len(s)) # pad
68
69 # output mean of each column of scores
70 col_means= np.array(score_mat).mean(axis=0)
71 mean_score = list(np.round(col_means * 100, decimals=1))

Callers 2

main · Function · 0.90
evaluate.py · File · 0.70

Calls 3

load_jsonl · Function · 0.90
parse_ground_truth · Function · 0.85
update · Method · 0.45

Tested by

no test coverage detected