hub / github.com/algorithmicsuperintelligence/optillm / verify_proof

Function verify_proof

scripts/eval_imobench_proof.py:103–165 · view source on GitHub ↗

Verify a proof using IMO25-style two-stage verification. Returns a score on a 0–7 scale and a detailed assessment.

(problem: str, solution: str, grading_guidelines: str, model: str, client: OpenAI)

Source from the content-addressed store, hash-verified

101
102
103	def verify_proof(problem: str, solution: str, grading_guidelines: str, model: str, client: OpenAI) -> Dict:
104	"""
105	Verify a proof using IMO25-style two-stage verification
106	Returns score on 0-7 scale and detailed assessment
107	"""
108	try:
109	# Format verification prompt
110	verification_text = VERIFICATION_PROMPT.format(
111	problem=problem,
112	solution=solution
113	)
114
115	# Add grading guidelines if available
116	if grading_guidelines and pd.notna(grading_guidelines):
117	verification_text += f"\n\nGrading Guidelines:\n{grading_guidelines}"
118
119	# Get verification response
120	response = client.chat.completions.create(
121	model=model,
122	messages=[
123	{"role": "system", "content": "You are an expert IMO grader. Provide rigorous assessment."},
124	{"role": "user", "content": verification_text}
125	],
126	max_tokens=4000,
127	temperature=0.1
128	)
129
130	verification_response = response.choices[0].message.content.strip()
131
132	# Extract score
133	score_match = re.search(r'\\SCORE:\\\s*(\d+)', verification_response)
134	score = int(score_match.group(1)) if score_match else 0
135
136	# Extract verdict
137	verdict_match = re.search(r'\\VERDICT:\\\s*([^\n]+)', verification_response)
138	verdict = verdict_match.group(1).strip() if verdict_match else "Unknown"
139
140	# Determine if correct (7 points = full marks)
141	is_correct = (score == 7)
142
143	# Check for critical errors
144	errors_match = re.search(r'\\CRITICAL ERRORS:\\\s*([^\n]+)', verification_response)
145	has_critical_errors = errors_match and "None" not in errors_match.group(1) if errors_match else False
146
147	return {
148	"score": score,
149	"verdict": verdict,
150	"is_correct": is_correct,
151	"has_critical_errors": has_critical_errors,
152	"verification_response": verification_response,
153	"success": True
154	}
155
156	except Exception as e:
157	logger.error(f"Error in proof verification: {e}")
158	return {
159	"score": 0,
160	"verdict": "Error",

Callers 1

main Function · 0.85

Calls 2

create Method · 0.45

search Method · 0.45

Tested by

no test coverage detected