Generate a comprehensive report comparing all approaches.
generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, is_test_time_compute: bool = False)
| 749 | logger.info(f"Results saved to {base_filename}_*") |
| 750 | |
| 751 | def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, is_test_time_compute: bool = False): |
| 752 | """Generate a comprehensive report comparing all approaches.""" |
| 753 | report = [] |
| 754 | |
| 755 | # Check if this is the default test-time compute evaluation |
| 756 | is_default_test_time = set(all_metrics.keys()) == {"avg@5", "pass@5", "maj@5", "genselect@5"} |
| 757 | |
| 758 | # Header |
| 759 | if is_default_test_time: |
| 760 | report_title = "OptiLLM Bench Test-Time Compute Evaluation Report" |
| 761 | elif is_test_time_compute: |
| 762 | report_title = "OptiLLM Bench Test-Time Compute Scaling Report" |
| 763 | else: |
| 764 | report_title = "OptiLLM Bench Evaluation Report" |
| 765 | |
| 766 | report.append(f"# {report_title}") |
| 767 | report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") |
| 768 | |
| 769 | if is_default_test_time: |
| 770 | report.append("## Test-Time Compute Evaluation Results\n") |
| 771 | report.append("This report evaluates the potential of test-time compute with:") |
| 772 | report.append("- **avg@5**: Average success rate of 5 parallel responses") |
| 773 | report.append("- **pass@5**: Success if ANY of 5 responses is correct") |
| 774 | report.append("- **maj@5**: Majority voting with 5 candidates") |
| 775 | report.append("- **genselect@5**: Quality-based selection from 5 candidates\n") |
| 776 | report.append("All approaches use n=5 parallel generation (with sequential fallback) for fair comparison.\n") |
| 777 | elif is_test_time_compute: |
| 778 | report.append("This report evaluates test-time compute scaling approaches:") |
| 779 | report.append("- **Sequential scaling**: ThinkDeeper with varying thinking token budgets") |
| 780 | report.append("- **Parallel scaling**: Majority voting with varying k values\n") |
| 781 | |
| 782 | # Overall Results Table |
| 783 | report.append("## Overall Results") |
| 784 | headers = ["Approach", "Accuracy", "Avg Time (s)", "Total Time (s)"] |
| 785 | rows = [] |
| 786 | |
| 787 | for approach, metrics in all_metrics.items(): |
| 788 | rows.append([ |
| 789 | approach, |
| 790 | f"{metrics['accuracy']*100:.2f}%", |
| 791 | f"{metrics['average_time']:.2f}", |
| 792 | f"{metrics['total_time']:.2f}" |
| 793 | ]) |
| 794 | |
| 795 | # Convert to DataFrame for nice formatting |
| 796 | df = pd.DataFrame(rows, columns=headers) |
| 797 | report.append(df.to_markdown()) |
| 798 | |
| 799 | # Category-wise Results |
| 800 | report.append("\n## Results by Category") |
| 801 | categories = ["gsm8k", "mmlu_math", "boolq", "aqua_rat"] |
| 802 | |
| 803 | for category in categories: |
| 804 | report.append(f"\n### {category.upper()}") |
| 805 | headers = ["Approach", "Accuracy", "Avg Time (s)"] |
| 806 | rows = [] |
| 807 | |
| 808 | for approach, metrics in all_metrics.items(): |