MCPcopy
hub / github.com/algorithmicsuperintelligence/optillm / main

Function main

scripts/eval_imo25_benchmark.py:684–768  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

682 print(f"{metric.replace('_', ' ').title()}: {count}/{total_problems} ({percentage:.1%})")
683
684def main():
685 parser = argparse.ArgumentParser(description="Evaluate LLM performance on IMO 2025 problems")
686 parser.add_argument("--model", type=str, required=True,
687 help="Model to use (e.g., google/gemma-2.5-flash-lite)")
688 parser.add_argument("--approach", type=str, default="none",
689 help="OptiLLM approach to use (none, mars, moa, bon, etc.)")
690 parser.add_argument("--timeout", type=int, default=600,
691 help="Timeout in seconds for each problem (default: 600)")
692 parser.add_argument("--problems", type=str,
693 help="Comma-separated list of problem IDs to evaluate (e.g., '1,3,5')")
694
695 args = parser.parse_args()
696
697 # Setup results directory and filename
698 os.makedirs("results", exist_ok=True)
699 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
700 results_file = f"results/imo25_{args.model.replace('/', '_')}_{args.approach}_{timestamp}.json"
701
702 # Determine which problems to evaluate
703 if args.problems:
704 problem_ids = [int(x.strip()) for x in args.problems.split(',')]
705 problems_to_evaluate = [p for p in IMO_2025_PROBLEMS if p['id'] in problem_ids]
706 else:
707 problems_to_evaluate = IMO_2025_PROBLEMS
708
709 print(f"Evaluating {len(problems_to_evaluate)} IMO 2025 problems")
710 print(f"Model: {args.model}")
711 print(f"Approach: {args.approach}")
712 print(f"Results will be saved to: {results_file}")
713
714 # Prepare extra_body for approach
715 # Special handling for MARS on IMO problems: disable thinking tags for proofs
716 if args.approach == "mars":
717 extra_body = {
718 "optillm_approach": "mars",
719 "mars_config": {
720 "use_thinking_tags": False, # IMO proofs need full visibility to evaluator
721 "answer_extraction_mode": "none" # Don't extract - proofs ARE the answer
722 }
723 }
724 elif args.approach != "none":
725 extra_body = {"optillm_approach": args.approach}
726 else:
727 extra_body = None
728
729 # Evaluate each problem
730 for problem_data in tqdm(problems_to_evaluate, desc="Solving IMO problems"):
731 logger.info(f"Evaluating Problem {problem_data['id']}: {problem_data['type']}")
732
733 start_time = time.time()
734
735 # Get LLM response
736 response = get_llm_response(
737 problem_data['problem'],
738 args.model,
739 extra_body,
740 args.timeout
741 )

Callers 1

Calls 5

evaluate_solution · Function · 0.85
get_llm_response · Function · 0.70
save_result · Function · 0.70
load_existing_results · Function · 0.70
analyze_results · Function · 0.70

Tested by

no test coverage detected