MCPcopy Index your code
hub / github.com/evalplus/evalplus / passk_rel_drop

Function passk_rel_drop

tools/viz_passrate.py:31–84  ·  view source on GitHub ↗
(task2bvs_old, task2bvs_new)

Source from the content-addressed store, hash-verified

29
30
def passk_rel_drop(task2bvs_old, task2bvs_new):
    """Print the relative drop in pass@k between two sets of results.

    Both arguments are indexed as ``[task][experiment]``; each entry is a
    vector of per-sample pass/fail booleans that supports ``len()`` and
    ``.sum()``:

    * dim0: problems
    * dim1: each experiment (model@temperature)
    * dim2: pass/fail booleans for each sample

    Experiments are grouped by their number of samples.  For each group and
    each ``k`` in (1, 10, 100) that does not exceed that number, the mean old
    pass@k, mean new pass@k and mean relative drop over the group's
    experiments are printed, followed by the mean and max drop over all
    reported ``k``.

    Returns ``None``; the report is written to stdout.  Empty input prints
    nothing, and a group for which no ``k`` applies prints only its header.
    """
    ks = (1, 10, 100)

    # The experiment count is read from the first task, so an empty task list
    # would otherwise fail with IndexError.
    if len(task2bvs_old) == 0:
        return

    # sample size => k => List[pass@k] (one entry per experiment)
    passk_old = {}
    passk_new = {}

    for exp_i in range(len(task2bvs_old[0])):
        ntotal = []
        npass_old = []
        npass_new = []
        nsamples = None
        for task_i, task_old in enumerate(task2bvs_old):
            bv_old = task_old[exp_i]
            bv_new = task2bvs_new[task_i][exp_i]
            if nsamples is None:
                # The first task fixes the sample count for this experiment.
                nsamples = len(bv_old)
            assert len(bv_old) == len(bv_new) == nsamples, (
                "every task of an experiment must have the same number of "
                "samples in both result sets"
            )
            ntotal.append(len(bv_old))
            npass_old.append(bv_old.sum())
            npass_new.append(bv_new.sum())

        d_old = passk_old.setdefault(nsamples, {})
        d_new = passk_new.setdefault(nsamples, {})
        for k in ks:
            if nsamples >= k:
                pass_at_k_old = estimate_pass_at_k(ntotal, npass_old, k).mean() * 100
                pass_at_k_new = estimate_pass_at_k(ntotal, npass_new, k).mean() * 100
                d_old.setdefault(k, []).append(pass_at_k_old)
                d_new.setdefault(k, []).append(pass_at_k_new)

    for nsamples, do in passk_old.items():
        dn = passk_new[nsamples]
        print("=====================================")
        print(f"{nsamples = }")
        drops = []
        for k in ks:
            if k not in do:
                continue
            pko = np.array(do[k])
            pkn = np.array(dn[k])
            # NOTE: an experiment whose old pass@k is 0 divides by zero here,
            # giving nan/inf for that entry (and for the means built on it).
            drop = 100 * (pko - pkn) / pko
            drops.append(drop)
            print(
                f"pass@{k}: \t{pko.mean():.1f}% -> {pkn.mean():.1f}% (drop {drop.mean():.1f}%)"
            )
        # No k applied to this group (sample count below every k): skip the
        # aggregates, since max() of a zero-size array raises ValueError.
        if drops:
            drops = np.array(drops)
            print(f"+++ {drops.mean() = :.1f}%")
            print(f"+++ {drops.max() = :.1f}%")
        print("=====================================")
85
86
87def get_data(paths: List[PathLike]):

Callers 1

viz_passrate.py — File · 0.85

Calls 1

estimate_pass_at_k — Function · 0.90

Tested by

no test coverage detected