IMO25-style evaluation using a rigorous two-stage verification system: (1) detailed verification with a comprehensive IMO grader prompt; (2) a simple yes/no check on solution correctness. This eliminates self-judgment bias and provides a more accurate assessment.
(problem_data: Dict, solution: str, model: str = "google/gemini-2.5-flash-lite")
| 516 | } |
| 517 | |
| 518 | def evaluate_solution(problem_data: Dict, solution: str, model: str = "google/gemini-2.5-flash-lite") -> Dict[str, any]: |
| 519 | """ |
| 520 | IMO25-style evaluation using rigorous two-stage verification system: |
| 521 | 1. Detailed verification with comprehensive IMO grader prompt |
| 522 | 2. Simple yes/no check on solution correctness |
| 523 | |
| 524 | This eliminates self-judgment bias and provides more accurate assessment |
| 525 | """ |
| 526 | logger.info(f"Running IMO25-style evaluation for problem {problem_data['id']}") |
| 527 | |
| 528 | # Use IMO25's rigorous two-stage verification with enhanced answer checking |
| 529 | imo25_verification = imo25_verify_solution(problem_data["problem"], solution, model, problem_data["id"]) |
| 530 | |
| 531 | # Extract answer for compatibility with existing code |
| 532 | answer_extraction = extract_final_answer(solution, problem_data["id"]) |
| 533 | |
| 534 | # Simple structural analysis for quality metrics |
| 535 | quality_analysis = extract_solution_quality(solution) |
| 536 | |
| 537 | # In IMO25 system, correctness is binary based on verification |
| 538 | correctness_score = 1.0 if imo25_verification["is_correct"] else 0.0 |
| 539 | |
| 540 | # Confidence based on verification success and quality |
| 541 | if imo25_verification["is_correct"] and quality_analysis["completeness_score"] > 0.7: |
| 542 | confidence = "high" |
| 543 | elif imo25_verification["is_correct"]: |
| 544 | confidence = "medium" |
| 545 | else: |
| 546 | confidence = "low" |
| 547 | |
| 548 | return { |
| 549 | # Primary binary result - this is what matters |
| 550 | "is_correct": imo25_verification["is_correct"], |
| 551 | "verdict": "Correct" if imo25_verification["is_correct"] else "Incorrect", |
| 552 | |
| 553 | # For compatibility with existing analysis code |
| 554 | "correctness_score": correctness_score, |
| 555 | "is_likely_correct": imo25_verification["is_correct"], |
| 556 | "confidence": confidence, |
| 557 | |
| 558 | # Verification details for transparency |
| 559 | "verification_details": { |
| 560 | "stage1_analysis": imo25_verification["judge_response"], |
| 561 | "stage2_check": imo25_verification["correctness_check"], |
| 562 | "errors_found": imo25_verification["errors_found"], |
| 563 | "bug_report": imo25_verification["bug_report"] if imo25_verification["bug_report"] else None |
| 564 | }, |
| 565 | |
| 566 | # Legacy compatibility for existing analysis code |
| 567 | "layer_scores": { |
| 568 | "structural_quality": quality_analysis["completeness_score"], |
| 569 | "insights_verification": 1.0 if imo25_verification["is_correct"] else 0.0, |
| 570 | "llm_judge": correctness_score, |
| 571 | "answer_extraction": answer_extraction["confidence"] |
| 572 | }, |
| 573 | "weights_used": { |
| 574 | "imo25_verification": 1.0 # Single source of truth |
| 575 | }, |
no test coverage detected