Load, score, and aggregate. Per-scenario scorer is picked from ``scenario.scoring_method`` when set, falling back to ``default_scoring_method``.
evaluate(
*,
trajectories_path: Path,
scenarios_paths: list[Path],
default_scoring_method: str = "llm_judge",
judge_model: str | None = None,
) -> EvalReport
| 9 | |
| 10 | |
def evaluate(
    *,
    trajectories_path: Path,
    scenarios_paths: list[Path],
    default_scoring_method: str = "llm_judge",
    judge_model: str | None = None,
) -> EvalReport:
    """Load, score, and aggregate in a single call.

    Convenience wrapper: builds an ``Evaluator`` from the scoring options
    and delegates to its ``evaluate`` method with the given paths.

    The scorer for each scenario is taken from ``scenario.scoring_method``
    when that is set; otherwise ``default_scoring_method`` is used.

    Args:
        trajectories_path: Forwarded unchanged to ``Evaluator.evaluate``.
        scenarios_paths: Forwarded unchanged to ``Evaluator.evaluate``.
        default_scoring_method: Fallback scoring method, handed to
            ``Evaluator`` as ``default_scorer``.
        judge_model: Forwarded unchanged to ``Evaluator``.
            NOTE(review): presumably the model used by the ``llm_judge``
            scorer -- confirm against ``Evaluator``.

    Returns:
        Whatever ``Evaluator.evaluate`` returns (annotated ``EvalReport``).
    """
    # All arguments are keyword-only, so each one is forwarded by name.
    evaluator = Evaluator(
        default_scorer=default_scoring_method,
        judge_model=judge_model,
    )
    report = evaluator.evaluate(
        trajectories_path=trajectories_path,
        scenarios_paths=scenarios_paths,
    )
    return report