Main evaluation function that handles gaps in processed indexes.
(model: str, n_attempts: int, year: int = 2024, analyze_thoughts: bool = False, analyze_logits: bool = False, test_time_compute: bool = False, approach_name: str = None, extra_body: dict = None)
| 818 | return response_id |
| 819 | |
| 820 | def main(model: str, n_attempts: int, year: int = 2024, analyze_thoughts: bool = False, analyze_logits: bool = False, test_time_compute: bool = False, approach_name: str = None, extra_body: dict = None): |
| 821 | """Main evaluation function that handles gaps in processed indexes.""" |
| 822 | os.makedirs("results", exist_ok=True) |
| 823 | |
| 824 | # Create suffix based on analysis flags |
| 825 | suffix_parts = [] |
| 826 | if year != 2024: |
| 827 | suffix_parts.append(f"aime{year}") |
| 828 | if analyze_thoughts: |
| 829 | suffix_parts.append("thought_analysis") |
| 830 | if analyze_logits: |
| 831 | suffix_parts.append("logit_analysis") |
| 832 | if approach_name: |
| 833 | suffix_parts.append(approach_name) |
| 834 | |
| 835 | suffix = "_" + "_".join(suffix_parts) if suffix_parts else "" |
| 836 | results_file = f"results/evaluation_results_{model.replace('/', '_')}_pass_at_{n_attempts}{suffix}.json" |
| 837 | |
| 838 | dataset = load_dataset_by_year(year) |
| 839 | existing_results = load_existing_results(results_file) |
| 840 | |
| 841 | # Create a set of already processed indexes for efficient lookup |
| 842 | processed_indexes = {result['index'] for result in existing_results} |
| 843 | |
| 844 | for _, item in enumerate(tqdm(dataset, desc="Evaluating problems")): |
| 845 | id = int(item['id']) |
| 846 | # Skip if this index has already been processed |
| 847 | if id in processed_indexes: |
| 848 | continue |
| 849 | |
| 850 | problem_text = item['problem'] |
| 851 | correct_answer = int(item['answer']) |
| 852 | |
| 853 | print(f"\n🔬 Processing Problem {id}: {problem_text[:100]}...") |
| 854 | print(f" Expected answer: {correct_answer}") |
| 855 | if extra_body and 'optillm_approach' in extra_body: |
| 856 | print(f" Using approach: {extra_body['optillm_approach']}") |
| 857 | |
| 858 | # Make n attempts for each problem |
| 859 | attempts = make_n_attempts(problem_text, model, n_attempts, analyze_thoughts, analyze_logits, extra_body) |
| 860 | is_correct, first_correct = evaluate_pass_at_n(attempts, correct_answer) |
| 861 | |
| 862 | # Report result |
| 863 | predicted_answers = [attempt.get('predicted_answer') for attempt in attempts] |
| 864 | print(f" Predicted: {predicted_answers}") |
| 865 | if is_correct: |
| 866 | print(f" ✅ CORRECT!") |
| 867 | else: |
| 868 | print(f" ❌ Incorrect") |
| 869 | |
| 870 | result = { |
| 871 | "index": id, |
| 872 | "problem": problem_text, |
| 873 | "attempts": attempts, |
| 874 | "correct_answer": correct_answer, |
| 875 | "is_correct": is_correct, |
| 876 | "first_correct_attempt": first_correct |
| 877 | } |
no test coverage detected