()
| 293 | |
| 294 | |
| 295 | def main(): |
| 296 | parser = argparse.ArgumentParser(description="Evaluate on IMO-Bench AnswerBench") |
| 297 | parser.add_argument("--model", type=str, required=True, |
| 298 | help="Model to use (e.g., google/gemini-2.5-flash-preview-09-2025)") |
| 299 | parser.add_argument("--base-url", type=str, default="http://localhost:8001/v1", |
| 300 | help="Base URL for OptiLLM server") |
| 301 | parser.add_argument("--timeout", type=int, default=300, |
| 302 | help="Timeout in seconds for each problem") |
| 303 | parser.add_argument("--limit", type=int, default=None, |
| 304 | help="Limit number of problems to evaluate (for testing)") |
| 305 | parser.add_argument("--categories", type=str, default=None, |
| 306 | help="Comma-separated list of categories to evaluate (e.g., 'Algebra,Geometry')") |
| 307 | |
| 308 | args = parser.parse_args() |
| 309 | |
| 310 | # Initialize OpenAI client |
| 311 | client = OpenAI(api_key="optillm", base_url=args.base_url) |
| 312 | |
| 313 | # Setup results directory |
| 314 | os.makedirs("results", exist_ok=True) |
| 315 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
| 316 | |
| 317 | # Determine if using MARS approach |
| 318 | is_mars = args.model.startswith("mars-") |
| 319 | approach_name = "mars" if is_mars else "baseline" |
| 320 | model_name = args.model.replace("mars-", "") if is_mars else args.model |
| 321 | |
| 322 | results_file = f"results/imobench_answer_{approach_name}_{model_name.replace('/', '_')}_{timestamp}.json" |
| 323 | |
| 324 | # Download dataset |
| 325 | df = download_answerbench() |
| 326 | |
| 327 | # Filter by categories if specified |
| 328 | if args.categories: |
| 329 | selected_cats = [c.strip() for c in args.categories.split(',')] |
| 330 | df = df[df['Category'].isin(selected_cats)] |
| 331 | print(f"Filtered to categories: {selected_cats}") |
| 332 | |
| 333 | # Limit problems if specified |
| 334 | if args.limit: |
| 335 | df = df.head(args.limit) |
| 336 | |
| 337 | print(f"\nEvaluating {len(df)} AnswerBench problems") |
| 338 | print(f"Model: {args.model}") |
| 339 | print(f"Approach: {approach_name}") |
| 340 | print(f"Results will be saved to: {results_file}\n") |
| 341 | |
| 342 | # Evaluate each problem |
| 343 | for idx, row in tqdm(df.iterrows(), total=len(df), desc="Solving problems"): |
| 344 | problem_id = row.get('Problem ID', f'problem_{idx}') |
| 345 | problem_text = row['Problem'] |
| 346 | ground_truth = row['Short Answer'] |
| 347 | category = row.get('Category', 'Unknown') |
| 348 | subcategory = row.get('Subcategory', '') |
| 349 | difficulty = row.get('Difficulty', '') |
| 350 | |
| 351 | logger.info(f"Evaluating {problem_id}: {category}") |
| 352 |
no test coverage detected