hub / github.com/algorithmicsuperintelligence/optillm / analyze_results

Function analyze_results

scripts/eval_imo25_benchmark.py:615–682 · view source on GitHub ↗

Analyze and print comprehensive statistics of IMO evaluation results

(results: List[Dict], approach_name: str = None)

Source from the content-addressed store, hash-verified

613	return []
614
615	def analyze_results(results: List[Dict], approach_name: str = None):
616	"""Analyze and print comprehensive statistics of IMO evaluation results"""
617	if not results:
618	print("No results to analyze")
619	return
620
621	total_problems = len(results)
622	likely_correct = sum(1 for r in results if r['evaluation']['is_correct'])
623	high_confidence = sum(1 for r in results if r['evaluation']['confidence'] == 'high')
624
625	avg_correctness = sum(r['evaluation']['correctness_score'] for r in results) / total_problems
626	avg_completeness = sum(r['evaluation']['quality_analysis']['completeness_score'] for r in results) / total_problems
627
628	total_reasoning_tokens = sum(r['response']['reasoning_tokens'] for r in results)
629	avg_reasoning_tokens = total_reasoning_tokens / total_problems
630
631	print("\n" + "="*80)
632	print(f"IMO 2025 Evaluation Results - {approach_name or 'Baseline'}")
633	print("="*80)
634	print(f"Total problems attempted: {total_problems}")
635	print(f"Likely correct solutions: {likely_correct} ({likely_correct/total_problems:.1%})")
636	print(f"High confidence solutions: {high_confidence} ({high_confidence/total_problems:.1%})")
637	print(f"Average correctness score: {avg_correctness:.3f}")
638	print(f"Average completeness score: {avg_completeness:.3f}")
639	print(f"Total reasoning tokens used: {total_reasoning_tokens:,}")
640	print(f"Average reasoning tokens per problem: {avg_reasoning_tokens:.0f}")
641
642	# Problem type breakdown
643	print(f"\nProblem Type Breakdown:")
644	type_stats = {}
645	for result in results:
646	prob_type = result['problem_data']['type']
647	if prob_type not in type_stats:
648	type_stats[prob_type] = {'total': 0, 'correct': 0, 'scores': []}
649	type_stats[prob_type]['total'] += 1
650	if result['evaluation']['is_correct']:
651	type_stats[prob_type]['correct'] += 1
652	type_stats[prob_type]['scores'].append(result['evaluation']['correctness_score'])
653
654	for prob_type, stats in type_stats.items():
655	accuracy = stats['correct'] / stats['total']
656	avg_score = sum(stats['scores']) / len(stats['scores'])
657	print(f" {prob_type}: {stats['correct']}/{stats['total']} ({accuracy:.1%}) - Avg score: {avg_score:.3f}")
658
659	# Detailed problem results
660	print(f"\nDetailed Results:")
661	print("-" * 80)
662	for result in results:
663	prob_id = result['problem_data']['id']
664	prob_type = result['problem_data']['type']
665	tokens = result['response']['reasoning_tokens']
666	is_correct = result['evaluation']['is_correct']
667	verdict = result['evaluation']['verdict']
668	status = "✓" if is_correct else "✗"
669	print(f"Problem {prob_id} ({prob_type}): {status} {verdict} - {tokens:,} tokens")
670
671	# Quality analysis summary
672	print(f"\nSolution Quality Analysis:")

Callers 1

main · Function · 0.70

Calls

no outgoing calls

Tested by

no test coverage detected