Analyze and print summary statistics of the results. Args: results (list[Dict]): List of evaluation results
(results: list[Dict])
| 719 | json.dump(results, f, indent=2) |
| 720 | |
def analyze_results(results: list[Dict]):
    """
    Print an accuracy summary followed by a listing of every missed problem.

    Each entry's 'is_correct' value is read to tally the score. For entries
    where it is falsy, the 'index', 'correct_answer' and 'predicted_answer'
    values are printed as well.

    Args:
        results (list[Dict]): List of evaluation results
    """
    n_total = len(results)
    # Partition once; the misses are reused for the detailed listing below.
    misses = [entry for entry in results if not entry['is_correct']]
    n_correct = n_total - len(misses)

    # An empty result list is reported as 0 accuracy instead of dividing by zero.
    if n_total > 0:
        accuracy = n_correct / n_total
    else:
        accuracy = 0

    summary_lines = (
        "\n=== Results Summary ===",
        f"Total problems: {n_total}",
        f"Correct answers: {n_correct}",
        f"Accuracy: {accuracy:.2%}",
    )
    for line in summary_lines:
        print(line)

    print("\n=== Incorrect Problems ===")
    for entry in misses:
        print(f"Problem {entry['index']}:")
        print(f"Expected: {entry['correct_answer']}")
        print(f"Predicted: {entry['predicted_answer']}")
        print("---")
| 744 | |
| 745 | def main(model: str): |
| 746 | """Main evaluation function.""" |