MCPcopy
hub / github.com/algorithmicsuperintelligence/optillm / analyze_results

Function analyze_results

scripts/eval_imo25_benchmark.py:615–682  ·  view source on GitHub ↗

Analyze and print comprehensive statistics of IMO evaluation results

(results: List[Dict], approach_name: str = None)

Source from the content-addressed store, hash-verified

613 return []
614
615def analyze_results(results: List[Dict], approach_name: str = None):
616 """Analyze and print comprehensive statistics of IMO evaluation results"""
617 if not results:
618 print("No results to analyze")
619 return
620
621 total_problems = len(results)
622 likely_correct = sum(1 for r in results if r['evaluation']['is_correct'])
623 high_confidence = sum(1 for r in results if r['evaluation']['confidence'] == 'high')
624
625 avg_correctness = sum(r['evaluation']['correctness_score'] for r in results) / total_problems
626 avg_completeness = sum(r['evaluation']['quality_analysis']['completeness_score'] for r in results) / total_problems
627
628 total_reasoning_tokens = sum(r['response']['reasoning_tokens'] for r in results)
629 avg_reasoning_tokens = total_reasoning_tokens / total_problems
630
631 print("\n" + "="*80)
632 print(f"IMO 2025 Evaluation Results - {approach_name or 'Baseline'}")
633 print("="*80)
634 print(f"Total problems attempted: {total_problems}")
635 print(f"Likely correct solutions: {likely_correct} ({likely_correct/total_problems:.1%})")
636 print(f"High confidence solutions: {high_confidence} ({high_confidence/total_problems:.1%})")
637 print(f"Average correctness score: {avg_correctness:.3f}")
638 print(f"Average completeness score: {avg_completeness:.3f}")
639 print(f"Total reasoning tokens used: {total_reasoning_tokens:,}")
640 print(f"Average reasoning tokens per problem: {avg_reasoning_tokens:.0f}")
641
642 # Problem type breakdown
643 print(f"\nProblem Type Breakdown:")
644 type_stats = {}
645 for result in results:
646 prob_type = result['problem_data']['type']
647 if prob_type not in type_stats:
648 type_stats[prob_type] = {'total': 0, 'correct': 0, 'scores': []}
649 type_stats[prob_type]['total'] += 1
650 if result['evaluation']['is_correct']:
651 type_stats[prob_type]['correct'] += 1
652 type_stats[prob_type]['scores'].append(result['evaluation']['correctness_score'])
653
654 for prob_type, stats in type_stats.items():
655 accuracy = stats['correct'] / stats['total']
656 avg_score = sum(stats['scores']) / len(stats['scores'])
657 print(f" {prob_type}: {stats['correct']}/{stats['total']} ({accuracy:.1%}) - Avg score: {avg_score:.3f}")
658
659 # Detailed problem results
660 print(f"\nDetailed Results:")
661 print("-" * 80)
662 for result in results:
663 prob_id = result['problem_data']['id']
664 prob_type = result['problem_data']['type']
665 tokens = result['response']['reasoning_tokens']
666 is_correct = result['evaluation']['is_correct']
667 verdict = result['evaluation']['verdict']
668 status = "✓" if is_correct else "✗"
669 print(f"Problem {prob_id} ({prob_type}): {status} {verdict} - {tokens:,} tokens")
670
671 # Quality analysis summary
672 print(f"\nSolution Quality Analysis:")

Callers 1

main · Function · 0.70

Calls

no outgoing calls

Tested by

no test coverage detected