Run a batch of scenarios against their saved trajectories.
| 28 | |
| 29 | |
| 30 | class Evaluator: |
| 31 | """Run a batch of scenarios against their saved trajectories.""" |
| 32 | |
| 33 | def __init__( |
| 34 | self, |
| 35 | default_scorer: str = "llm_judge", |
| 36 | judge_model: str | None = None, |
| 37 | ) -> None: |
| 38 | self.default_scorer = default_scorer |
| 39 | self.judge_model = judge_model |
| 40 | |
def evaluate(
    self,
    *,
    trajectories_path: Path,
    scenarios_paths: list[Path],
) -> EvalReport:
    """Score every joined scenario/trajectory pair and build a report.

    Args:
        trajectories_path: Location handed to ``load_trajectories``.
        scenarios_paths: Locations handed to ``load_scenarios``.

    Returns:
        The result of ``build_report`` applied to the per-pair results,
        kept in the order ``join_records`` yields the pairs.
    """
    # Load order: scenarios first, then trajectories.
    loaded_scenarios = load_scenarios(scenarios_paths)
    loaded_trajectories = load_trajectories(trajectories_path)

    pairs = join_records(loaded_scenarios, loaded_trajectories)
    scored: list[ScenarioResult] = [
        self._score_one(scenario, trajectory) for scenario, trajectory in pairs
    ]
    return build_report(scored)
| 55 | |
def _score_one(
    self, scenario: Scenario, traj: PersistedTrajectory
) -> ScenarioResult:
    """Score one scenario against its persisted trajectory.

    The scorer name is the scenario's own ``scoring_method`` when that is
    truthy, otherwise ``self.default_scorer``. The scorer is resolved
    first, then the judge model is validated, and only then is the scorer
    invoked.

    Args:
        scenario: The scenario being scored.
        traj: The saved trajectory paired with ``scenario``.

    Returns:
        A ``ScenarioResult`` combining identifying fields from the
        scenario and trajectory with the computed score and the metrics
        from ``metrics_from_trajectory``.
    """
    scorer_name = scenario.scoring_method or self.default_scorer
    score_fn = self._resolve(scorer_name)
    self._validate_judge_model(scorer_name, traj)

    transcript = _trajectory_to_text(traj)
    # The same processed answer is passed to the scorer and stored in the result.
    cleaned_answer = _strip_think_blocks(traj.answer)
    outcome = score_fn(scenario, cleaned_answer, transcript)

    return ScenarioResult(
        scenario_id=scenario.id,
        scenario_type=scenario.type,
        run_id=traj.run_id,
        runner=traj.runner,
        model=traj.model,
        question=traj.question,
        answer=cleaned_answer,
        score=outcome,
        ops=metrics_from_trajectory(traj),
    )
| 77 | |
@staticmethod
def _resolve(name: str) -> Scorer:
    """Look up the scorer registered under ``name``.

    Delegates to ``scorer_registry.get``. How an unknown name is handled
    (exception vs. sentinel) is decided by that registry and is not
    visible here -- NOTE(review): confirm before relying on either.
    """
    scorer = scorer_registry.get(name)
    return scorer
| 81 | |
| 82 | def _validate_judge_model(self, scorer_name: str, traj: PersistedTrajectory) -> None: |
| 83 | if scorer_name != "llm_judge" or not self.judge_model: |
| 84 | return |
| 85 | |
| 86 | trajectory_model = _normalize_model_id(traj.model) |
| 87 | judge_model = _normalize_model_id(self.judge_model) |
no outgoing calls