MCPcopy Index your code
hub / github.com/algorithmicsuperintelligence/optillm / main

Function main

scripts/eval_imobench_proof.py:468–596  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

466
467
468def main():
469 parser = argparse.ArgumentParser(description="Evaluate on IMO-Bench ProofBench")
470 parser.add_argument("--model", type=str, required=True,
471 help="Model to use (e.g., google/gemini-2.5-flash-preview-09-2025 or mars-...)")
472 parser.add_argument("--base-url", type=str, default="http://localhost:8001/v1",
473 help="Base URL for OptiLLM server")
474 parser.add_argument("--verifier-model", type=str, default=None,
475 help="Model to use for verification (defaults to same as solver)")
476 parser.add_argument("--timeout", type=int, default=600,
477 help="Timeout in seconds for each problem")
478 parser.add_argument("--limit", type=int, default=None,
479 help="Limit number of problems (for testing)")
480 parser.add_argument("--subset", type=str, default=None,
481 help="Evaluate only 'basic' or 'advanced' subset")
482
483 args = parser.parse_args()
484
485 # Initialize OpenAI client
486 client = OpenAI(api_key="optillm", base_url=args.base_url)
487
488 # Verifier model defaults to solver model
489 verifier_model = args.verifier_model or args.model.replace("mars-", "")
490
491 # Setup results directory
492 os.makedirs("results", exist_ok=True)
493 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
494
495 # Determine if using MARS
496 is_mars = args.model.startswith("mars-")
497 approach_name = "mars" if is_mars else "baseline"
498 model_name = args.model.replace("mars-", "") if is_mars else args.model
499
500 results_file = f"results/imobench_proof_{approach_name}_{model_name.replace('/', '_')}_{timestamp}.json"
501
502 # Download dataset
503 df = download_proofbench()
504
505 # Filter by subset if specified
506 if args.subset:
507 if args.subset.lower() == 'basic':
508 df = df[df['Level'].str.contains('Basic', case=False, na=False)]
509 elif args.subset.lower() == 'advanced':
510 df = df[df['Level'].str.contains('Advanced', case=False, na=False)]
511 print(f"Filtered to {args.subset} subset")
512
513 # Limit problems if specified
514 if args.limit:
515 df = df.head(args.limit)
516
517 print(f"\nEvaluating {len(df)} ProofBench problems")
518 print(f"Model: {args.model}")
519 print(f"Approach: {approach_name}")
520 print(f"Verifier: {verifier_model}")
521 if is_mars:
522 print("MARS Config: use_thinking_tags=False, answer_extraction_mode='none'")
523 print(f"Results will be saved to: {results_file}\n")
524
525 # Evaluate each problem

Callers 1

Calls 7

download_proofbenchFunction · 0.85
verify_proofFunction · 0.85
get_llm_responseFunction · 0.70
extract_solution_qualityFunction · 0.70
save_resultFunction · 0.70
load_existing_resultsFunction · 0.70
analyze_resultsFunction · 0.70

Tested by

no test coverage detected