Main evaluation function.
(model: str)
| 743 | print("---") |
| 744 | |
def main(model: str):
    """Evaluate ``model`` on the MATH-500 dataset and analyze the outcome.

    Problems whose index already appears in the results file are skipped,
    so rerunning continues with the remaining problems. Each new result is
    saved as soon as it is produced; after the loop the results file is
    reloaded and handed to ``analyze_results``.

    Args:
        model: Model identifier forwarded to ``get_llm_response``. Every
            ``/`` in it is replaced with ``_`` to build the results filename.
    """
    os.makedirs("results", exist_ok=True)
    # NOTE(review): the filename below carries no "results/" prefix even
    # though that directory was just created -- confirm whether save_result /
    # load_existing_results add the directory themselves.
    model_tag = model.replace('/', '_')
    results_file = f"evaluation_results_math500_{model_tag}.json"

    problems = load_math500_dataset()
    prior_results = load_existing_results(results_file)

    # Indexes that already have a saved result; a set keeps lookups cheap.
    done_indexes = set()
    for entry in prior_results:
        done_indexes.add(entry['index'])

    # The progress bar still advances over every problem; the filter only
    # drops the ones that were handled by an earlier run.
    pending = (
        (position, problem)
        for position, problem in enumerate(
            tqdm(problems, desc="Evaluating problems")
        )
        if position not in done_indexes
    )

    for position, problem in pending:
        question = problem['problem']
        expected = problem['answer']

        # Query the model, then pull the final answer out of its reply.
        reply = get_llm_response(question, model)
        predicted = extract_answer(reply)

        # Persist immediately so progress survives an interrupted run.
        save_result(
            results_file,
            {
                "index": position,
                "problem": question,
                "response": reply,
                "correct_answer": expected,
                "predicted_answer": predicted,
                "is_correct": compare_answers(expected, predicted),
            },
        )

    # Re-read from disk so the analysis covers earlier and new results alike.
    analyze_results(load_existing_results(results_file))
| 783 | |
| 784 | if __name__ == "__main__": |
| 785 | parser = argparse.ArgumentParser(description="Evaluate LLM performance on MATH-500 problems") |
no test coverage detected