Method evaluate

src/evaluation/evaluator.py:41–54 · view source on GitHub ↗

(
        self,
        *,
        trajectories_path: Path,
        scenarios_paths: list[Path],
    )

Source from the content-addressed store, hash-verified

39	self.judge_model = judge_model
40
def evaluate(
    self,
    *,
    trajectories_path: Path,
    scenarios_paths: list[Path],
) -> EvalReport:
    """Score each joined scenario/trajectory pair and build a report.

    Args:
        trajectories_path: Handed to ``load_trajectories``.
        scenarios_paths: Handed to ``load_scenarios``.

    Returns:
        The value ``build_report`` produces from the per-pair results,
        which are collected in the order ``join_records`` yields pairs.
    """
    # Load scenarios first, then trajectories.
    loaded_scenarios = load_scenarios(scenarios_paths)
    loaded_trajectories = load_trajectories(trajectories_path)

    # One ``_score_one`` call per (scenario, trajectory) pair.
    scored: list[ScenarioResult] = [
        self._score_one(scenario, traj)
        for scenario, traj in join_records(loaded_scenarios, loaded_trajectories)
    ]
    return build_report(scored)
55
56	def _score_one(
57	self, scenario: Scenario, traj: PersistedTrajectory

main · Function · 0.80

evaluate · Function · 0.80

test_evaluator_routes_to_default_scorer · Function · 0.80

test_evaluator_strips_think_blocks_before_scoring · Function · 0.80

test_evaluator_per_scenario_override_wins · Function · 0.80

test_evaluator_rejects_self_judging_model · Function · 0.80

test_evaluator_rejects_self_judging_with_normalized_model_ids · Function · 0.80

test_evaluator_allows_non_llm_judge_even_with_matching_model · Function · 0.80

_score_one · Method · 0.95

load_scenarios · Function · 0.85

load_trajectories · Function · 0.85

join_records · Function · 0.85

build_report · Function · 0.85

test_evaluator_routes_to_default_scorer · Function · 0.64

test_evaluator_strips_think_blocks_before_scoring · Function · 0.64

test_evaluator_per_scenario_override_wins · Function · 0.64

test_evaluator_rejects_self_judging_model · Function · 0.64

test_evaluator_rejects_self_judging_with_normalized_model_ids · Function · 0.64

test_evaluator_allows_non_llm_judge_even_with_matching_model · Function · 0.64