| 104 | |
| 105 | |
def build_report(results: list[ScenarioResult]) -> EvalReport:
    """Aggregate individual scenario results into one ``EvalReport``.

    Args:
        results: Scenario results to summarise. Each one is read for its
            ``score.passed``, ``scenario_type``, ``runner`` and ``model``
            attributes.

    Returns:
        An ``EvalReport`` holding:

        * ``generated_at`` - ISO-formatted timestamp taken with the UTC
          timezone at call time.
        * ``runners`` / ``models`` - sorted, de-duplicated values seen
          across ``results``.
        * ``totals`` - overall ``scenarios``, ``scored`` (same value as
          ``scenarios``), ``passed`` and ``pass_rate``.
        * ``by_scenario_type`` - one ``TypeBreakdown`` per scenario type,
          in first-seen order; results with a falsy ``scenario_type`` are
          grouped under ``"unknown"``.
        * ``ops`` / ``score_summary`` - whatever ``aggregate_ops`` and
          ``_aggregate_score_summary`` return for ``results``.
        * ``results`` - the input list itself.

        Pass rates are rounded to four decimal places and are ``0.0`` when
        there is nothing to divide by.
    """

    def _count_passed(entries) -> int:
        # Counts entries whose score is marked as passed (truthy).
        return sum(1 for entry in entries if entry.score.passed)

    def _ratio(numerator: int, denominator: int) -> float:
        # Guard against division by zero for an empty result set.
        if not denominator:
            return 0.0
        return round(numerator / denominator, 4)

    scenario_count = len(results)
    passed_count = _count_passed(results)

    # Bucket results by scenario type, preserving first-seen order.
    grouped: dict[str, list[ScenarioResult]] = defaultdict(list)
    for result in results:
        label = result.scenario_type or "unknown"
        grouped[label].append(result)

    per_type: dict[str, TypeBreakdown] = {}
    for label, members in grouped.items():
        member_count = len(members)
        member_passed = _count_passed(members)
        per_type[label] = TypeBreakdown(
            total=member_count,
            passed=member_passed,
            pass_rate=_ratio(member_passed, member_count),
        )

    # Locals are evaluated in the same order the original keyword arguments
    # were, so any side effects of the helpers happen in the same sequence.
    timestamp = _dt.datetime.now(_dt.timezone.utc).isoformat()
    runner_names = sorted({result.runner for result in results})
    model_names = sorted({result.model for result in results})
    overall = {
        "scenarios": scenario_count,
        "scored": scenario_count,
        "passed": passed_count,
        "pass_rate": _ratio(passed_count, scenario_count),
    }

    return EvalReport(
        generated_at=timestamp,
        runners=runner_names,
        models=model_names,
        totals=overall,
        by_scenario_type=per_type,
        ops=aggregate_ops(results),
        score_summary=_aggregate_score_summary(results),
        results=results,
    )
| 139 | |
| 140 | |
| 141 | def write_report(report: EvalReport, output: Path) -> Path: |