()
| 682 | print(f"{metric.replace('_', ' ').title()}: {count}/{total_problems} ({percentage:.1%})") |
| 683 | |
| 684 | def main(): |
| 685 | parser = argparse.ArgumentParser(description="Evaluate LLM performance on IMO 2025 problems") |
| 686 | parser.add_argument("--model", type=str, required=True, |
| 687 | help="Model to use (e.g., google/gemma-2.5-flash-lite)") |
| 688 | parser.add_argument("--approach", type=str, default="none", |
| 689 | help="OptiLLM approach to use (none, mars, moa, bon, etc.)") |
| 690 | parser.add_argument("--timeout", type=int, default=600, |
| 691 | help="Timeout in seconds for each problem (default: 600)") |
| 692 | parser.add_argument("--problems", type=str, |
| 693 | help="Comma-separated list of problem IDs to evaluate (e.g., '1,3,5')") |
| 694 | |
| 695 | args = parser.parse_args() |
| 696 | |
| 697 | # Setup results directory and filename |
| 698 | os.makedirs("results", exist_ok=True) |
| 699 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| 700 | results_file = f"results/imo25_{args.model.replace('/', '_')}_{args.approach}_{timestamp}.json" |
| 701 | |
| 702 | # Determine which problems to evaluate |
| 703 | if args.problems: |
| 704 | problem_ids = [int(x.strip()) for x in args.problems.split(',')] |
| 705 | problems_to_evaluate = [p for p in IMO_2025_PROBLEMS if p['id'] in problem_ids] |
| 706 | else: |
| 707 | problems_to_evaluate = IMO_2025_PROBLEMS |
| 708 | |
| 709 | print(f"Evaluating {len(problems_to_evaluate)} IMO 2025 problems") |
| 710 | print(f"Model: {args.model}") |
| 711 | print(f"Approach: {args.approach}") |
| 712 | print(f"Results will be saved to: {results_file}") |
| 713 | |
| 714 | # Prepare extra_body for approach |
| 715 | # Special handling for MARS on IMO problems: disable thinking tags for proofs |
| 716 | if args.approach == "mars": |
| 717 | extra_body = { |
| 718 | "optillm_approach": "mars", |
| 719 | "mars_config": { |
| 720 | "use_thinking_tags": False, # IMO proofs need full visibility to evaluator |
| 721 | "answer_extraction_mode": "none" # Don't extract - proofs ARE the answer |
| 722 | } |
| 723 | } |
| 724 | elif args.approach != "none": |
| 725 | extra_body = {"optillm_approach": args.approach} |
| 726 | else: |
| 727 | extra_body = None |
| 728 | |
| 729 | # Evaluate each problem |
| 730 | for problem_data in tqdm(problems_to_evaluate, desc="Solving IMO problems"): |
| 731 | logger.info(f"Evaluating Problem {problem_data['id']}: {problem_data['type']}") |
| 732 | |
| 733 | start_time = time.time() |
| 734 | |
| 735 | # Get LLM response |
| 736 | response = get_llm_response( |
| 737 | problem_data['problem'], |
| 738 | args.model, |
| 739 | extra_body, |
| 740 | args.timeout |
| 741 | ) |
no test coverage detected