MCPcopy Index your code
hub / github.com/algorithmicsuperintelligence/optillm / main

Function main

scripts/eval_imobench_answer.py:295–403  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

293
294
295def main():
296 parser = argparse.ArgumentParser(description="Evaluate on IMO-Bench AnswerBench")
297 parser.add_argument("--model", type=str, required=True,
298 help="Model to use (e.g., google/gemini-2.5-flash-preview-09-2025)")
299 parser.add_argument("--base-url", type=str, default="http://localhost:8001/v1",
300 help="Base URL for OptiLLM server")
301 parser.add_argument("--timeout", type=int, default=300,
302 help="Timeout in seconds for each problem")
303 parser.add_argument("--limit", type=int, default=None,
304 help="Limit number of problems to evaluate (for testing)")
305 parser.add_argument("--categories", type=str, default=None,
306 help="Comma-separated list of categories to evaluate (e.g., 'Algebra,Geometry')")
307
308 args = parser.parse_args()
309
310 # Initialize OpenAI client
311 client = OpenAI(api_key="optillm", base_url=args.base_url)
312
313 # Setup results directory
314 os.makedirs("results", exist_ok=True)
315 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
316
317 # Determine if using MARS approach
318 is_mars = args.model.startswith("mars-")
319 approach_name = "mars" if is_mars else "baseline"
320 model_name = args.model.replace("mars-", "") if is_mars else args.model
321
322 results_file = f"results/imobench_answer_{approach_name}_{model_name.replace('/', '_')}_{timestamp}.json"
323
324 # Download dataset
325 df = download_answerbench()
326
327 # Filter by categories if specified
328 if args.categories:
329 selected_cats = [c.strip() for c in args.categories.split(',')]
330 df = df[df['Category'].isin(selected_cats)]
331 print(f"Filtered to categories: {selected_cats}")
332
333 # Limit problems if specified
334 if args.limit:
335 df = df.head(args.limit)
336
337 print(f"\nEvaluating {len(df)} AnswerBench problems")
338 print(f"Model: {args.model}")
339 print(f"Approach: {approach_name}")
340 print(f"Results will be saved to: {results_file}\n")
341
342 # Evaluate each problem
343 for idx, row in tqdm(df.iterrows(), total=len(df), desc="Solving problems"):
344 problem_id = row.get('Problem ID', f'problem_{idx}')
345 problem_text = row['Problem']
346 ground_truth = row['Short Answer']
347 category = row.get('Category', 'Unknown')
348 subcategory = row.get('Subcategory', '')
349 difficulty = row.get('Difficulty', '')
350
351 logger.info(f"Evaluating {problem_id}: {category}")
352

Callers 1

Calls 7

download_answerbench · Function · 0.85
get_llm_response · Function · 0.70
compare_answers · Function · 0.70
save_result · Function · 0.70
load_existing_results · Function · 0.70
analyze_results · Function · 0.70

Tested by

no test coverage detected