Analyze and print summary statistics of the results. Args: results (List[Dict]): List of evaluation results; n (int): Number of attempts per problem; analyze_thoughts (bool): Whether to analyze thinking patterns; analyze_logits (bool): Whether to analyze token probabilities.
(results: List[Dict], n: int, analyze_thoughts: bool = False, analyze_logits: bool = False)
| 473 | return max(int(r.get('index', -1)) for r in results) |
| 474 | |
| 475 | def analyze_results(results: List[Dict], n: int, analyze_thoughts: bool = False, analyze_logits: bool = False): |
| 476 | """ |
| 477 | Analyze and print summary statistics of the results. |
| 478 | |
| 479 | Args: |
| 480 | results (List[Dict]): List of evaluation results |
| 481 | n (int): Number of attempts per problem |
| 482 | analyze_thoughts (bool): Whether to analyze thinking patterns |
| 483 | analyze_logits (bool): Whether to analyze token probabilities |
| 484 | """ |
| 485 | total = len(results) |
| 486 | correct = sum(1 for r in results if r['is_correct']) |
| 487 | accuracy = correct / total if total > 0 else 0 |
| 488 | |
| 489 | print("\n=== Results Summary ===") |
| 490 | print(f"Evaluation mode: pass@{n}") |
| 491 | print(f"Total problems: {total}") |
| 492 | print(f"Correct answers: {correct}") |
| 493 | print(f"Accuracy: {accuracy:.2%}") |
| 494 | |
| 495 | # Calculate attempt statistics |
| 496 | successful_attempts = [r['first_correct_attempt'] for r in results if r['is_correct']] |
| 497 | if successful_attempts: |
| 498 | avg_attempts = sum(successful_attempts) / len(successful_attempts) |
| 499 | print(f"\nFor correct solutions:") |
| 500 | print(f"Average attempts needed: {avg_attempts:.2f}") |
| 501 | print(f"Attempt distribution:") |
| 502 | for i in range(1, n + 1): |
| 503 | count = sum(1 for x in successful_attempts if x == i) |
| 504 | print(f" Attempt {i}: {count} problems") |
| 505 | |
| 506 | if analyze_thoughts: |
| 507 | print("\n=== Thinking Pattern Analysis ===") |
| 508 | |
| 509 | # Collect metrics about thinking patterns for correct vs incorrect attempts |
| 510 | correct_attempts = [] |
| 511 | incorrect_attempts = [] |
| 512 | |
| 513 | for result in results: |
| 514 | for attempt in result['attempts']: |
| 515 | if 'thought_analysis' in attempt: |
| 516 | if result['is_correct'] and attempt['predicted_answer'] == result['correct_answer']: |
| 517 | correct_attempts.append(attempt) |
| 518 | else: |
| 519 | incorrect_attempts.append(attempt) |
| 520 | |
| 521 | # Function to calculate statistics for a group of attempts |
| 522 | def calc_stats(attempts): |
| 523 | if not attempts: |
| 524 | return { |
| 525 | "count": 0, |
| 526 | "avg_thinking_tokens": 0, |
| 527 | "avg_thought_transitions": 0, |
| 528 | "transition_usage": {phrase: 0 for phrase in THOUGHT_TRANSITIONS}, |
| 529 | "has_think_tags_pct": 0 |
| 530 | } |
| 531 | |
| 532 | thinking_tokens = [a['thought_analysis']['thinking_tokens'] for a in attempts] |
no test coverage detected