Analyze and print comprehensive statistics with full credit prioritized
(results: List[Dict])
| 324 | |
| 325 | |
| 326 | def analyze_results(results: List[Dict]): |
| 327 | """Analyze and print comprehensive statistics with full credit prioritized""" |
| 328 | if not results: |
| 329 | print("No results to analyze") |
| 330 | return |
| 331 | |
| 332 | total_problems = len(results) |
| 333 | full_marks = sum(1 for r in results if r['verification'].get('score', 0) == 7) |
| 334 | partial_credit = sum(1 for r in results if 1 <= r['verification'].get('score', 0) <= 6) |
| 335 | no_credit = total_problems - full_marks - partial_credit |
| 336 | |
| 337 | total_score = sum(r['verification'].get('score', 0) for r in results) |
| 338 | avg_score = total_score / total_problems |
| 339 | |
| 340 | print("\n" + "="*80) |
| 341 | print("IMO-Bench ProofBench Evaluation Results") |
| 342 | print("="*80) |
| 343 | |
| 344 | # ======================================================================== |
| 345 | # SECTION 1: FULL CREDIT SCORES (PRIMARY METRIC) |
| 346 | # ======================================================================== |
| 347 | print("\n" + "="*80) |
| 348 | print("FULL CREDIT SCORES (7/7 = Solved) - PRIMARY METRIC") |
| 349 | print("="*80) |
| 350 | print(f"\nOverall: {full_marks}/{total_problems} = {full_marks/total_problems*100:.1f}%") |
| 351 | |
| 352 | # Basic vs Advanced breakdown (full credit only) |
| 353 | basic_full = sum(1 for r in results if 'Basic' in r.get('problem_id', '') and r['verification'].get('score', 0) == 7) |
| 354 | basic_total = sum(1 for r in results if 'Basic' in r.get('problem_id', '')) |
| 355 | adv_full = sum(1 for r in results if 'Advanced' in r.get('problem_id', '') and r['verification'].get('score', 0) == 7) |
| 356 | adv_total = sum(1 for r in results if 'Advanced' in r.get('problem_id', '')) |
| 357 | |
| 358 | print(f"\nBasic problems: {basic_full}/{basic_total} = {basic_full/basic_total*100 if basic_total > 0 else 0:.1f}%") |
| 359 | print(f"Advanced problems: {adv_full}/{adv_total} = {adv_full/adv_total*100 if adv_total > 0 else 0:.1f}%") |
| 360 | |
| 361 | # ======================================================================== |
| 362 | # SECTION 2: SUBSET BREAKDOWN (Novel, IMO 2024, USAMO 2025) |
| 363 | # ======================================================================== |
| 364 | subset_stats = calculate_subset_scores(results) |
| 365 | |
| 366 | if any(total > 0 for _, total, _ in subset_stats.values()): |
| 367 | print("\n" + "-"*80) |
| 368 | print("Subset Breakdown (Full Credit Only):") |
| 369 | print("-"*80) |
| 370 | for name in ['Novel', 'IMO 2024', 'USAMO 2025']: |
| 371 | full, total, pct = subset_stats[name] |
| 372 | if total > 0: |
| 373 | print(f"{name:15s}: {full}/{total} = {pct:.1f}%") |
| 374 | |
| 375 | # ======================================================================== |
| 376 | # SECTION 3: DETAILED ANALYSIS (Average Scores and Distributions) |
| 377 | # ======================================================================== |
| 378 | print("\n" + "="*80) |
| 379 | print("DETAILED ANALYSIS (Average Scores)") |
| 380 | print("="*80) |
| 381 | print(f"\nAverage score: {avg_score:.2f}/7 ({avg_score/7*100:.1f}%)") |
| 382 | print(f"Full credit (7/7): {full_marks} ({full_marks/total_problems*100:.1f}%)") |
| 383 | print(f"Partial credit (1-6): {partial_credit} ({partial_credit/total_problems*100:.1f}%)") |
no test coverage detected