(task2bvs_old, task2bvs_new)
| 29 | |
| 30 | |
def passk_rel_drop(task2bvs_old, task2bvs_new, ks=(1, 10, 100)):
    """Print the relative drop in pass@k between an old and a new run.

    Both inputs are indexed as ``task2bvs[task][experiment]``; each entry is
    a pass/fail vector with one element per sample (dim0: problems, dim1:
    experiments such as model@temperature, dim2: per-sample booleans).  Each
    vector only needs to support ``len()`` and ``.sum()``.

    Experiments are grouped by their sample count.  For every group and every
    ``k`` in ``ks`` that does not exceed that sample count, this prints the
    mean pass@k (in percent) of both runs and the relative drop
    ``100 * (old - new) / old`` averaged over the group's experiments,
    followed by the mean and max drop over all reported ``k``.

    The pass@k values themselves come from ``estimate_pass_at_k``, which is
    defined outside this block and is called as
    ``estimate_pass_at_k(samples_per_task, passes_per_task, k)``.

    Args:
        task2bvs_old: Baseline results.
        task2bvs_new: Results to compare against the baseline; every
            (task, experiment) entry must have the same number of samples
            as the baseline entry.
        ks: The ``k`` values to report.  Defaults to ``(1, 10, 100)``.

    Returns:
        None.  The report is written to stdout.

    Raises:
        AssertionError: If, within one experiment, the sample counts differ
            between tasks or between the old and new run.
    """
    # Nothing to report for an empty benchmark; indexing [0] below would fail.
    if len(task2bvs_old) == 0:
        return

    # sample size => k => list of pass@k percentages (one per experiment)
    passk_old = {}
    passk_new = {}

    for exp_i in range(len(task2bvs_old[0])):
        ntotal = []
        npass_old = []
        npass_new = []
        nsamples = None
        for task_i, per_exp_old in enumerate(task2bvs_old):
            bv_old = per_exp_old[exp_i]
            bv_new = task2bvs_new[task_i][exp_i]
            if nsamples is None:
                # The first task fixes the sample count for this experiment.
                nsamples = len(bv_old)
            assert len(bv_old) == len(bv_new) == nsamples
            ntotal.append(nsamples)
            npass_old.append(bv_old.sum())
            npass_new.append(bv_new.sum())

        d_old = passk_old.setdefault(nsamples, {})
        d_new = passk_new.setdefault(nsamples, {})
        for k in ks:
            if nsamples < k:
                # pass@k is only reported when at least k samples exist.
                continue
            pass_at_k_old = estimate_pass_at_k(ntotal, npass_old, k).mean() * 100
            pass_at_k_new = estimate_pass_at_k(ntotal, npass_new, k).mean() * 100
            d_old.setdefault(k, []).append(pass_at_k_old)
            d_new.setdefault(k, []).append(pass_at_k_new)

    for nsamples, do in passk_old.items():
        dn = passk_new[nsamples]
        print("=====================================")
        print(f"{nsamples = }")
        drops = []
        for k in ks:
            if k not in do:
                continue
            pko = np.asarray(do[k], dtype=float)
            pkn = np.asarray(dn[k], dtype=float)
            # A relative drop is undefined when the baseline is 0: leave those
            # entries as NaN instead of dividing by zero, and exclude them
            # from the aggregates below.
            drop = np.full_like(pko, np.nan)
            np.divide(100 * (pko - pkn), pko, out=drop, where=pko != 0)
            drops.append(drop)
            print(
                f"pass@{k}: \t{pko.mean():.1f}% -> {pkn.mean():.1f}% "
                f"(drop {_finite_stat(drop, np.mean):.1f}%)"
            )
        # Flatten so the summary covers every (k, experiment) pair; the group
        # may have no qualifying k at all, in which case this is empty.
        all_drops = np.concatenate(drops) if drops else np.empty(0)
        print(f"+++ drops.mean() = {_finite_stat(all_drops, np.mean):.1f}%")
        print(f"+++ drops.max() = {_finite_stat(all_drops, np.max):.1f}%")
        print("=====================================")


def _finite_stat(values, reducer):
    """Reduce the non-NaN entries of a 1-D array; NaN when none remain."""
    values = np.asarray(values, dtype=float)
    defined = values[~np.isnan(values)]
    return reducer(defined) if defined.size else float("nan")
| 85 | |
| 86 | |
| 87 | def get_data(paths: List[PathLike]): |
no test coverage detected