hub / github.com/algorithmicsuperintelligence/optillm / generate_report

Function generate_report

scripts/eval_optillmbench.py:751–857 · view source on GitHub ↗

Generate a comprehensive report comparing all approaches.

(all_metrics: Dict[str, Dict[str, float]], output_dir: str, is_test_time_compute: bool = False)

Source from the content-addressed store, hash-verified

749	logger.info(f"Results saved to {base_filename}_*")
750
751	def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, is_test_time_compute: bool = False):
752	"""Generate a comprehensive report comparing all approaches."""
753	report = []
754
755	# Check if this is the default test-time compute evaluation
756	is_default_test_time = set(all_metrics.keys()) == {"avg@5", "pass@5", "maj@5", "genselect@5"}
757
758	# Header
759	if is_default_test_time:
760	report_title = "OptiLLM Bench Test-Time Compute Evaluation Report"
761	elif is_test_time_compute:
762	report_title = "OptiLLM Bench Test-Time Compute Scaling Report"
763	else:
764	report_title = "OptiLLM Bench Evaluation Report"
765
766	report.append(f"# {report_title}")
767	report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
768
769	if is_default_test_time:
770	report.append("## Test-Time Compute Evaluation Results\n")
771	report.append("This report evaluates the potential of test-time compute with:")
772	report.append("- avg@5: Average success rate of 5 parallel responses")
773	report.append("- pass@5: Success if ANY of 5 responses is correct")
774	report.append("- maj@5: Majority voting with 5 candidates")
775	report.append("- genselect@5: Quality-based selection from 5 candidates\n")
776	report.append("All approaches use n=5 parallel generation (with sequential fallback) for fair comparison.\n")
777	elif is_test_time_compute:
778	report.append("This report evaluates test-time compute scaling approaches:")
779	report.append("- Sequential scaling: ThinkDeeper with varying thinking token budgets")
780	report.append("- Parallel scaling: Majority voting with varying k values\n")
781
782	# Overall Results Table
783	report.append("## Overall Results")
784	headers = ["Approach", "Accuracy", "Avg Time (s)", "Total Time (s)"]
785	rows = []
786
787	for approach, metrics in all_metrics.items():
788	rows.append([
789	approach,
790	f"{metrics['accuracy']*100:.2f}%",
791	f"{metrics['average_time']:.2f}",
792	f"{metrics['total_time']:.2f}"
793	])
794
795	# Convert to DataFrame for nice formatting
796	df = pd.DataFrame(rows, columns=headers)
797	report.append(df.to_markdown())
798
799	# Category-wise Results
800	report.append("\n## Results by Category")
801	categories = ["gsm8k", "mmlu_math", "boolq", "aqua_rat"]
802
803	for category in categories:
804	report.append(f"\n### {category.upper()}")
805	headers = ["Approach", "Accuracy", "Avg Time (s)"]
806	rows = []
807
808	for approach, metrics in all_metrics.items():

Callers 1

main · Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected