MCPcopy Index your code
hub / github.com/algorithmicsuperintelligence/optillm / compare_answers

Function compare_answers

scripts/eval_math500_benchmark.py:639–675  ·  view source on GitHub ↗

Compare the correct answer with the predicted answer.

(correct_answer: str, predicted_answer: Optional[str])

Source from the content-addressed store, hash-verified

637 return result if result else None
638
def compare_answers(correct_answer: str, predicted_answer: Optional[str]) -> bool:
    """Compare the correct answer with the predicted answer.

    The comparison proceeds in stages:

    1. A missing prediction (``None``) never matches.
    2. If ``numerically_equal`` reports the two raw answers as equal, they
       match.
    3. Otherwise both answers are passed through ``normalize_answer`` and
       must be non-empty and exactly equal as strings.

    Args:
        correct_answer: The reference answer.
        predicted_answer: The answer to check, or ``None`` when no answer
            is available.

    Returns:
        ``True`` if the answers are considered equal, ``False`` otherwise.
    """
    # Lazy %-style arguments: the repr() work is skipped entirely when
    # debug logging is disabled.
    logger.debug(
        "Comparing answers - Correct: %r, Predicted: %r",
        correct_answer,
        predicted_answer,
    )

    if predicted_answer is None:
        logger.debug("Predicted answer is None")
        return False

    # Try numerical comparison first
    if numerically_equal(correct_answer, predicted_answer):
        return True

    normalized_correct = normalize_answer(correct_answer)
    normalized_predicted = normalize_answer(predicted_answer)

    logger.debug(
        "Normalized answers - Correct: %r, Predicted: %r",
        normalized_correct,
        normalized_predicted,
    )

    # A falsy normalization result (None or "") can never match. This also
    # covers the case where both sides normalize to "", so no separate
    # "both empty" check is required afterwards.
    if not normalized_correct or not normalized_predicted:
        logger.debug("One or both normalized answers are None or empty")
        return False

    # Intervals must match exactly (including brackets); every other answer
    # is compared exactly as well, so the equality test is shared and only
    # the log label distinguishes the two cases.
    interval_markers = ('\\left[', '\\left(')
    both_intervals = (
        any(marker in normalized_correct for marker in interval_markers)
        and any(marker in normalized_predicted for marker in interval_markers)
    )

    result = normalized_correct == normalized_predicted
    if both_intervals:
        logger.debug("Interval comparison result: %s", result)
    else:
        logger.debug("Comparison result: %s", result)
    return result
676
677def get_llm_response(problem: str, model: str) -> str:
678 """

Callers 1

mainFunction · 0.70

Calls 2

numerically_equalFunction · 0.85
normalize_answerFunction · 0.70

Tested by

no test coverage detected