MCPcopy
hub / github.com/algorithmicsuperintelligence/optillm / verify_proof

Function verify_proof

scripts/eval_imobench_proof.py:103–165  ·  view source on GitHub ↗

Verify a proof using IMO25-style two-stage verification. Returns a score on a 0-7 scale and a detailed assessment.

(problem: str, solution: str, grading_guidelines: str, model: str, client: OpenAI)

Source from the content-addressed store, hash-verified

101
102
103def verify_proof(problem: str, solution: str, grading_guidelines: str, model: str, client: OpenAI) -> Dict:
104 """
105 Verify a proof using IMO25-style two-stage verification
106 Returns score on 0-7 scale and detailed assessment
107 """
108 try:
109 # Format verification prompt
110 verification_text = VERIFICATION_PROMPT.format(
111 problem=problem,
112 solution=solution
113 )
114
115 # Add grading guidelines if available
116 if grading_guidelines and pd.notna(grading_guidelines):
117 verification_text += f"\n\n**Grading Guidelines:**\n{grading_guidelines}"
118
119 # Get verification response
120 response = client.chat.completions.create(
121 model=model,
122 messages=[
123 {"role": "system", "content": "You are an expert IMO grader. Provide rigorous assessment."},
124 {"role": "user", "content": verification_text}
125 ],
126 max_tokens=4000,
127 temperature=0.1
128 )
129
130 verification_response = response.choices[0].message.content.strip()
131
132 # Extract score
133 score_match = re.search(r'\*\*SCORE:\*\*\s*(\d+)', verification_response)
134 score = int(score_match.group(1)) if score_match else 0
135
136 # Extract verdict
137 verdict_match = re.search(r'\*\*VERDICT:\*\*\s*([^\n]+)', verification_response)
138 verdict = verdict_match.group(1).strip() if verdict_match else "Unknown"
139
140 # Determine if correct (7 points = full marks)
141 is_correct = (score == 7)
142
143 # Check for critical errors
144 errors_match = re.search(r'\*\*CRITICAL ERRORS:\*\*\s*([^\n]+)', verification_response)
145 has_critical_errors = errors_match and "None" not in errors_match.group(1) if errors_match else False
146
147 return {
148 "score": score,
149 "verdict": verdict,
150 "is_correct": is_correct,
151 "has_critical_errors": has_critical_errors,
152 "verification_response": verification_response,
153 "success": True
154 }
155
156 except Exception as e:
157 logger.error(f"Error in proof verification: {e}")
158 return {
159 "score": 0,
160 "verdict": "Error",

Callers 1

main · Function · 0.85

Calls 2

create · Method · 0.45
search · Method · 0.45

Tested by

no test coverage detected