Analyze and print comprehensive statistics of IMO evaluation results
analyze_results(results: List[Dict], approach_name: str = None)
| 613 | return [] |
| 614 | |
| 615 | def analyze_results(results: List[Dict], approach_name: str = None): |
| 616 | """Analyze and print comprehensive statistics of IMO evaluation results""" |
| 617 | if not results: |
| 618 | print("No results to analyze") |
| 619 | return |
| 620 | |
| 621 | total_problems = len(results) |
| 622 | likely_correct = sum(1 for r in results if r['evaluation']['is_correct']) |
| 623 | high_confidence = sum(1 for r in results if r['evaluation']['confidence'] == 'high') |
| 624 | |
| 625 | avg_correctness = sum(r['evaluation']['correctness_score'] for r in results) / total_problems |
| 626 | avg_completeness = sum(r['evaluation']['quality_analysis']['completeness_score'] for r in results) / total_problems |
| 627 | |
| 628 | total_reasoning_tokens = sum(r['response']['reasoning_tokens'] for r in results) |
| 629 | avg_reasoning_tokens = total_reasoning_tokens / total_problems |
| 630 | |
| 631 | print("\n" + "="*80) |
| 632 | print(f"IMO 2025 Evaluation Results - {approach_name or 'Baseline'}") |
| 633 | print("="*80) |
| 634 | print(f"Total problems attempted: {total_problems}") |
| 635 | print(f"Likely correct solutions: {likely_correct} ({likely_correct/total_problems:.1%})") |
| 636 | print(f"High confidence solutions: {high_confidence} ({high_confidence/total_problems:.1%})") |
| 637 | print(f"Average correctness score: {avg_correctness:.3f}") |
| 638 | print(f"Average completeness score: {avg_completeness:.3f}") |
| 639 | print(f"Total reasoning tokens used: {total_reasoning_tokens:,}") |
| 640 | print(f"Average reasoning tokens per problem: {avg_reasoning_tokens:.0f}") |
| 641 | |
| 642 | # Problem type breakdown |
| 643 | print(f"\nProblem Type Breakdown:") |
| 644 | type_stats = {} |
| 645 | for result in results: |
| 646 | prob_type = result['problem_data']['type'] |
| 647 | if prob_type not in type_stats: |
| 648 | type_stats[prob_type] = {'total': 0, 'correct': 0, 'scores': []} |
| 649 | type_stats[prob_type]['total'] += 1 |
| 650 | if result['evaluation']['is_correct']: |
| 651 | type_stats[prob_type]['correct'] += 1 |
| 652 | type_stats[prob_type]['scores'].append(result['evaluation']['correctness_score']) |
| 653 | |
| 654 | for prob_type, stats in type_stats.items(): |
| 655 | accuracy = stats['correct'] / stats['total'] |
| 656 | avg_score = sum(stats['scores']) / len(stats['scores']) |
| 657 | print(f" {prob_type}: {stats['correct']}/{stats['total']} ({accuracy:.1%}) - Avg score: {avg_score:.3f}") |
| 658 | |
| 659 | # Detailed problem results |
| 660 | print(f"\nDetailed Results:") |
| 661 | print("-" * 80) |
| 662 | for result in results: |
| 663 | prob_id = result['problem_data']['id'] |
| 664 | prob_type = result['problem_data']['type'] |
| 665 | tokens = result['response']['reasoning_tokens'] |
| 666 | is_correct = result['evaluation']['is_correct'] |
| 667 | verdict = result['evaluation']['verdict'] |
| 668 | status = "✓" if is_correct else "✗" |
| 669 | print(f"Problem {prob_id} ({prob_type}): {status} {verdict} - {tokens:,} tokens") |
| 670 | |
| 671 | # Quality analysis summary |
| 672 | print(f"\nSolution Quality Analysis:") |