hub / github.com/algorithmicsuperintelligence/optillm / main

Function main

scripts/eval_imo25_benchmark.py:684–768 · view source on GitHub ↗

()

Source from the content-addressed store, hash-verified

682	print(f"{metric.replace('_', ' ').title()}: {count}/{total_problems} ({percentage:.1%})")
683
684	def main():
685	parser = argparse.ArgumentParser(description="Evaluate LLM performance on IMO 2025 problems")
686	parser.add_argument("--model", type=str, required=True,
687	help="Model to use (e.g., google/gemma-2.5-flash-lite)")
688	parser.add_argument("--approach", type=str, default="none",
689	help="OptiLLM approach to use (none, mars, moa, bon, etc.)")
690	parser.add_argument("--timeout", type=int, default=600,
691	help="Timeout in seconds for each problem (default: 600)")
692	parser.add_argument("--problems", type=str,
693	help="Comma-separated list of problem IDs to evaluate (e.g., '1,3,5')")
694
695	args = parser.parse_args()
696
697	# Setup results directory and filename
698	os.makedirs("results", exist_ok=True)
699	timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
700	results_file = f"results/imo25_{args.model.replace('/', '_')}_{args.approach}_{timestamp}.json"
701
702	# Determine which problems to evaluate
703	if args.problems:
704	problem_ids = [int(x.strip()) for x in args.problems.split(',')]
705	problems_to_evaluate = [p for p in IMO_2025_PROBLEMS if p['id'] in problem_ids]
706	else:
707	problems_to_evaluate = IMO_2025_PROBLEMS
708
709	print(f"Evaluating {len(problems_to_evaluate)} IMO 2025 problems")
710	print(f"Model: {args.model}")
711	print(f"Approach: {args.approach}")
712	print(f"Results will be saved to: {results_file}")
713
714	# Prepare extra_body for approach
715	# Special handling for MARS on IMO problems: disable thinking tags for proofs
716	if args.approach == "mars":
717	extra_body = {
718	"optillm_approach": "mars",
719	"mars_config": {
720	"use_thinking_tags": False, # IMO proofs need full visibility to evaluator
721	"answer_extraction_mode": "none" # Don't extract - proofs ARE the answer
722	}
723	}
724	elif args.approach != "none":
725	extra_body = {"optillm_approach": args.approach}
726	else:
727	extra_body = None
728
729	# Evaluate each problem
730	for problem_data in tqdm(problems_to_evaluate, desc="Solving IMO problems"):
731	logger.info(f"Evaluating Problem {problem_data['id']}: {problem_data['type']}")
732
733	start_time = time.time()
734
735	# Get LLM response
736	response = get_llm_response(
737	problem_data['problem'],
738	args.model,
739	extra_body,
740	args.timeout
741	)

Callers 1

eval_imo25_benchmark.py · File · 0.70

Calls 5

evaluate_solution · Function · 0.85

get_llm_response · Function · 0.70

save_result · Function · 0.70

load_existing_results · Function · 0.70

analyze_results · Function · 0.70

Tested by

no test coverage detected