MCPcopy
hub / github.com/IBM/AssetOpsBench / Evaluator

Class Evaluator

src/evaluation/evaluator.py:30–95  ·  view source on GitHub ↗

Run a batch of scenarios against their saved trajectories.

Source from the content-addressed store, hash-verified

28
29
30class Evaluator:
31 """Run a batch of scenarios against their saved trajectories."""
32
33 def __init__(
34 self,
35 default_scorer: str = "llm_judge",
36 judge_model: str | None = None,
37 ) -> None:
38 self.default_scorer = default_scorer
39 self.judge_model = judge_model
40
def evaluate(
    self,
    *,
    trajectories_path: Path,
    scenarios_paths: list[Path],
) -> EvalReport:
    """Score every joined (scenario, trajectory) pair and build a report.

    Args:
        trajectories_path: Location handed to ``load_trajectories``.
        scenarios_paths: Locations handed to ``load_scenarios``.

    Returns:
        The value produced by ``build_report`` from the per-pair results,
        in the order ``join_records`` yields the pairs.
    """
    # Scenarios are loaded before trajectories, matching the original order.
    loaded_scenarios = load_scenarios(scenarios_paths)
    loaded_trajectories = load_trajectories(trajectories_path)

    # NOTE(review): how unmatched scenarios/trajectories are treated is
    # decided inside join_records and is not visible here.
    pairs = join_records(loaded_scenarios, loaded_trajectories)
    scored: list[ScenarioResult] = [
        self._score_one(scenario, trajectory) for scenario, trajectory in pairs
    ]
    return build_report(scored)
55
def _score_one(
    self, scenario: Scenario, traj: PersistedTrajectory
) -> ScenarioResult:
    """Score a single scenario against its persisted trajectory.

    The scorer name is the scenario's own ``scoring_method`` when truthy,
    otherwise ``self.default_scorer``. The scorer is resolved first, then
    the judge-model validation runs, and only then is the scorer invoked.

    Args:
        scenario: Scenario being evaluated.
        traj: Persisted trajectory paired with the scenario.

    Returns:
        A ``ScenarioResult`` combining scenario/trajectory identifiers, the
        post-processed answer, the scorer output and the trajectory metrics.
    """
    scorer_name = scenario.scoring_method or self.default_scorer

    # Resolve before validating so the original failure order is preserved.
    score_fn = self._resolve(scorer_name)
    self._validate_judge_model(scorer_name, traj)

    transcript = _trajectory_to_text(traj)
    # The result records the answer after _strip_think_blocks, not the raw
    # traj.answer, and the scorer sees the same processed text.
    cleaned_answer = _strip_think_blocks(traj.answer)
    verdict = score_fn(scenario, cleaned_answer, transcript)
    ops_metrics = metrics_from_trajectory(traj)

    return ScenarioResult(
        scenario_id=scenario.id,
        scenario_type=scenario.type,
        run_id=traj.run_id,
        runner=traj.runner,
        model=traj.model,
        question=traj.question,
        answer=cleaned_answer,
        score=verdict,
        ops=ops_metrics,
    )
77
@staticmethod
def _resolve(name: str) -> Scorer:
    """Return the scorer that ``scorer_registry.get`` yields for ``name``.

    NOTE(review): what happens for an unregistered name (exception versus
    a ``None`` result) is determined by ``scorer_registry`` and cannot be
    confirmed from this method alone.
    """
    scorer = scorer_registry.get(name)
    return scorer
81
82 def _validate_judge_model(self, scorer_name: str, traj: PersistedTrajectory) -> None:
83 if scorer_name != "llm_judge" or not self.judge_model:
84 return
85
86 trajectory_model = _normalize_model_id(traj.model)
87 judge_model = _normalize_model_id(self.judge_model)

Calls

no outgoing calls