()
| 466 | |
| 467 | |
| 468 | def main(): |
| 469 | parser = argparse.ArgumentParser(description="Evaluate on IMO-Bench ProofBench") |
| 470 | parser.add_argument("--model", type=str, required=True, |
| 471 | help="Model to use (e.g., google/gemini-2.5-flash-preview-09-2025 or mars-...)") |
| 472 | parser.add_argument("--base-url", type=str, default="http://localhost:8001/v1", |
| 473 | help="Base URL for OptiLLM server") |
| 474 | parser.add_argument("--verifier-model", type=str, default=None, |
| 475 | help="Model to use for verification (defaults to same as solver)") |
| 476 | parser.add_argument("--timeout", type=int, default=600, |
| 477 | help="Timeout in seconds for each problem") |
| 478 | parser.add_argument("--limit", type=int, default=None, |
| 479 | help="Limit number of problems (for testing)") |
| 480 | parser.add_argument("--subset", type=str, default=None, |
| 481 | help="Evaluate only 'basic' or 'advanced' subset") |
| 482 | |
| 483 | args = parser.parse_args() |
| 484 | |
| 485 | # Initialize OpenAI client |
| 486 | client = OpenAI(api_key="optillm", base_url=args.base_url) |
| 487 | |
| 488 | # Verifier model defaults to solver model |
| 489 | verifier_model = args.verifier_model or args.model.replace("mars-", "") |
| 490 | |
| 491 | # Setup results directory |
| 492 | os.makedirs("results", exist_ok=True) |
| 493 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| 494 | |
| 495 | # Determine if using MARS |
| 496 | is_mars = args.model.startswith("mars-") |
| 497 | approach_name = "mars" if is_mars else "baseline" |
| 498 | model_name = args.model.replace("mars-", "") if is_mars else args.model |
| 499 | |
| 500 | results_file = f"results/imobench_proof_{approach_name}_{model_name.replace('/', '_')}_{timestamp}.json" |
| 501 | |
| 502 | # Download dataset |
| 503 | df = download_proofbench() |
| 504 | |
| 505 | # Filter by subset if specified |
| 506 | if args.subset: |
| 507 | if args.subset.lower() == 'basic': |
| 508 | df = df[df['Level'].str.contains('Basic', case=False, na=False)] |
| 509 | elif args.subset.lower() == 'advanced': |
| 510 | df = df[df['Level'].str.contains('Advanced', case=False, na=False)] |
| 511 | print(f"Filtered to {args.subset} subset") |
| 512 | |
| 513 | # Limit problems if specified |
| 514 | if args.limit: |
| 515 | df = df.head(args.limit) |
| 516 | |
| 517 | print(f"\nEvaluating {len(df)} ProofBench problems") |
| 518 | print(f"Model: {args.model}") |
| 519 | print(f"Approach: {approach_name}") |
| 520 | print(f"Verifier: {verifier_model}") |
| 521 | if is_mars: |
| 522 | print("MARS Config: use_thinking_tags=False, answer_extraction_mode='none'") |
| 523 | print(f"Results will be saved to: {results_file}\n") |
| 524 | |
| 525 | # Evaluate each problem |
no test coverage detected