Compare the correct answer with the predicted answer.
(correct_answer: str, predicted_answer: Optional[str])
| 637 | return result if result else None |
| 638 | |
def compare_answers(correct_answer: str, predicted_answer: Optional[str]) -> bool:
    """Compare the correct answer with the predicted answer.

    The checks run in this order:

    1. A missing prediction (``None``) never matches.
    2. If ``numerically_equal`` accepts the raw pair, the answers match.
    3. Otherwise both answers are passed through ``normalize_answer``; the
       normalized forms must both be non-empty and exactly equal.

    Args:
        correct_answer: The reference answer.
        predicted_answer: The answer to check, or ``None`` when no answer
            is available.

    Returns:
        ``True`` if the answers are considered equal, ``False`` otherwise.
    """
    logger.debug(f"Comparing answers - Correct: {correct_answer!r}, Predicted: {predicted_answer!r}")

    if predicted_answer is None:
        logger.debug("Predicted answer is None")
        return False

    # Try numerical comparison first
    if numerically_equal(correct_answer, predicted_answer):
        return True

    normalized_correct = normalize_answer(correct_answer)
    normalized_predicted = normalize_answer(predicted_answer)

    logger.debug(f"Normalized answers - Correct: {normalized_correct!r}, Predicted: {normalized_predicted!r}")

    # A None or empty normalization on either side can never match. This also
    # covers the case where both sides are empty, so no separate check for
    # that is needed.
    if not normalized_correct or not normalized_predicted:
        logger.debug("One or both normalized answers are None or empty")
        return False

    # Intervals must match exactly (including brackets). That is the same
    # strict equality used for every other answer, so the comparison is done
    # once and only the debug label distinguishes the two cases.
    interval_markers = ('\\left[', '\\left(')
    both_intervals = (
        any(marker in normalized_correct for marker in interval_markers)
        and any(marker in normalized_predicted for marker in interval_markers)
    )

    result = normalized_correct == normalized_predicted
    if both_intervals:
        logger.debug(f"Interval comparison result: {result}")
    else:
        logger.debug(f"Comparison result: {result}")
    return result
| 676 | |
| 677 | def get_llm_response(problem: str, model: str) -> str: |
| 678 | """ |
no test coverage detected