hub / github.com/algorithmicsuperintelligence/optillm / compare_answers

Function compare_answers

scripts/eval_math500_benchmark.py:639–675 · view source on GitHub ↗

Compare the correct answer with the predicted answer.

(correct_answer: str, predicted_answer: Optional[str])

Source from the content-addressed store, hash-verified

637	return result if result else None
638
def compare_answers(correct_answer: str, predicted_answer: Optional[str]) -> bool:
    """Compare the correct answer with the predicted answer.

    The check runs in stages:

    1. A missing prediction (``None``) never matches.
    2. If ``numerically_equal`` accepts the raw pair, the answers match.
    3. Otherwise both answers are passed through ``normalize_answer`` and
       the normalized forms must be exactly equal.

    Args:
        correct_answer: The reference answer.
        predicted_answer: The answer to check, or ``None`` when there is
            no prediction.

    Returns:
        ``True`` if the answers are considered equal, ``False`` otherwise.
        Once the numerical check has failed, a normalized answer that is
        ``None`` or empty never matches.
    """
    logger.debug(f"Comparing answers - Correct: {correct_answer!r}, Predicted: {predicted_answer!r}")

    if predicted_answer is None:
        logger.debug("Predicted answer is None")
        return False

    # Try numerical comparison first
    if numerically_equal(correct_answer, predicted_answer):
        return True

    normalized_correct = normalize_answer(correct_answer)
    normalized_predicted = normalize_answer(predicted_answer)

    logger.debug(f"Normalized answers - Correct: {normalized_correct!r}, Predicted: {normalized_predicted!r}")

    # A falsy normalization result (None or "") can never match. This guard
    # also covers the case where both sides normalize to "", so no separate
    # "both empty" check is needed afterwards.
    if not normalized_correct or not normalized_predicted:
        logger.debug("One or both normalized answers are None or empty")
        return False

    # Intervals must match exactly (including brackets). That is the same
    # strict equality used for every other answer, so only the debug label
    # depends on whether both sides look like intervals.
    interval_markers = ('\\left[', '\\left(')
    both_intervals = (
        any(marker in normalized_correct for marker in interval_markers)
        and any(marker in normalized_predicted for marker in interval_markers)
    )

    result = normalized_correct == normalized_predicted
    if both_intervals:
        logger.debug(f"Interval comparison result: {result}")
    else:
        logger.debug(f"Comparison result: {result}")
    return result
676
677	def get_llm_response(problem: str, model: str) -> str:
678	"""

Callers 1

main (Function) · 0.70

Calls 2

numerically_equal (Function) · 0.85

normalize_answer (Function) · 0.70

Tested by

no test coverage detected