MCP · copy
hub / github.com/algorithmicsuperintelligence/optillm / generate_report

Function generate_report

scripts/eval_optillmbench.py:751–857  ·  view source on GitHub ↗

Generate a comprehensive report comparing all approaches.

(all_metrics: Dict[str, Dict[str, float]], output_dir: str, is_test_time_compute: bool = False)

Source from the content-addressed store, hash-verified

749 logger.info(f"Results saved to {base_filename}_*")
750
751def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, is_test_time_compute: bool = False):
752 """Generate a comprehensive report comparing all approaches."""
753 report = []
754
755 # Check if this is the default test-time compute evaluation
756 is_default_test_time = set(all_metrics.keys()) == {"avg@5", "pass@5", "maj@5", "genselect@5"}
757
758 # Header
759 if is_default_test_time:
760 report_title = "OptiLLM Bench Test-Time Compute Evaluation Report"
761 elif is_test_time_compute:
762 report_title = "OptiLLM Bench Test-Time Compute Scaling Report"
763 else:
764 report_title = "OptiLLM Bench Evaluation Report"
765
766 report.append(f"# {report_title}")
767 report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
768
769 if is_default_test_time:
770 report.append("## Test-Time Compute Evaluation Results\n")
771 report.append("This report evaluates the potential of test-time compute with:")
772 report.append("- **avg@5**: Average success rate of 5 parallel responses")
773 report.append("- **pass@5**: Success if ANY of 5 responses is correct")
774 report.append("- **maj@5**: Majority voting with 5 candidates")
775 report.append("- **genselect@5**: Quality-based selection from 5 candidates\n")
776 report.append("All approaches use n=5 parallel generation (with sequential fallback) for fair comparison.\n")
777 elif is_test_time_compute:
778 report.append("This report evaluates test-time compute scaling approaches:")
779 report.append("- **Sequential scaling**: ThinkDeeper with varying thinking token budgets")
780 report.append("- **Parallel scaling**: Majority voting with varying k values\n")
781
782 # Overall Results Table
783 report.append("## Overall Results")
784 headers = ["Approach", "Accuracy", "Avg Time (s)", "Total Time (s)"]
785 rows = []
786
787 for approach, metrics in all_metrics.items():
788 rows.append([
789 approach,
790 f"{metrics['accuracy']*100:.2f}%",
791 f"{metrics['average_time']:.2f}",
792 f"{metrics['total_time']:.2f}"
793 ])
794
795 # Convert to DataFrame for nice formatting
796 df = pd.DataFrame(rows, columns=headers)
797 report.append(df.to_markdown())
798
799 # Category-wise Results
800 report.append("\n## Results by Category")
801 categories = ["gsm8k", "mmlu_math", "boolq", "aqua_rat"]
802
803 for category in categories:
804 report.append(f"\n### {category.upper()}")
805 headers = ["Approach", "Accuracy", "Avg Time (s)"]
806 rows = []
807
808 for approach, metrics in all_metrics.items():

Callers 1

main · Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected