hub / github.com/wshobson/agents / JudgeAnalyzer

Class JudgeAnalyzer

plugins/plugin-eval/src/plugin_eval/layers/judge.py:146–321 · view source on GitHub ↗

Semantic skill evaluation using Claude as a judge.

Source from the content-addressed store, hash-verified

144
145
146	class JudgeAnalyzer:
147	"""Semantic skill evaluation using Claude as a judge."""
148
149	def __init__(self, config: JudgeConfig) -> None:
150	self.config = config
151	self._sem = asyncio.Semaphore(config.concurrency)
152
153	# ------------------------------------------------------------------
154	# Public API
155	# ------------------------------------------------------------------
156
async def analyze_skill(self, skill_or_dir: Path | ParsedSkill) -> LayerResult:
    """Run the four judge assessments concurrently and combine the results.

    Args:
        skill_or_dir: An already parsed skill, or a path that is handed to
            ``parse_skill`` to obtain one.

    Returns:
        A ``LayerResult`` for the ``"judge"`` layer. ``sub_scores`` contains
        only the dimensions for which ``_measured_score`` returned a value,
        ``score`` is their arithmetic mean (``0.0`` when none was measured),
        and ``metadata`` carries the four raw assessment results plus the
        sorted names of the unmeasured dimensions under ``"unmeasured"``.

    Raises:
        Whatever an individual assessment raises: ``asyncio.gather`` is
        called without ``return_exceptions``, so the first exception
        propagates to the caller.
    """
    if isinstance(skill_or_dir, ParsedSkill):
        skill = skill_or_dir
    else:
        skill = parse_skill(skill_or_dir)

    triggering, orchestration, output_quality, scope = await asyncio.gather(
        self.assess_triggering(skill),
        self.assess_orchestration(skill),
        self.assess_output_quality(skill),
        self.assess_scope(skill),
    )

    # Triggering is scored from its "f1" field; the other three from "score".
    raw_scores: dict[str, float | None] = {
        "triggering_accuracy": _measured_score(triggering, "f1"),
        "orchestration_fitness": _measured_score(orchestration, "score"),
        "output_quality": _measured_score(output_quality, "score"),
        "scope_calibration": _measured_score(scope, "score"),
    }

    # Split measured from unmeasured dimensions in a single pass; the
    # measured ones keep the insertion order of ``raw_scores``.
    sub_scores: dict[str, float] = {}
    unmeasured: list[str] = []
    for dimension, value in raw_scores.items():
        if value is None:
            unmeasured.append(dimension)
        else:
            sub_scores[dimension] = value
    unmeasured.sort()

    # Layer score is display-only; the composite engine blends per-dimension
    # sub_scores (omitted keys are excluded). Use the mean of measured dims.
    if sub_scores:
        score = sum(sub_scores.values()) / len(sub_scores)
    else:
        score = 0.0

    metadata: dict = {
        "triggering": triggering,
        "orchestration": orchestration,
        "output_quality": output_quality,
        "scope": scope,
        "unmeasured": unmeasured,
    }

    return LayerResult(
        layer="judge",
        score=score,
        sub_scores=sub_scores,
        metadata=metadata,
    )
194
195	# ------------------------------------------------------------------
196	# Individual assessments
197	# ------------------------------------------------------------------
198
199	async def assess_triggering(self, skill: Path \| ParsedSkill) -> dict:
200	"""Generate 10 synthetic prompts and classify triggering accuracy via Haiku."""
201	if isinstance(skill, Path):
202	skill = parse_skill(skill)
203	model = _resolve_model("haiku")

Callers 6

evaluate_skill · Method · 0.90

test_assess_triggering · Method · 0.90

test_assess_orchestration · Method · 0.90

test_full_analysis · Method · 0.90

test_all_unmeasured_yields_empty_sub_scores · Method · 0.90

test_partial_measurement_omits_only_failed · Method · 0.90

Calls

no outgoing calls

Tested by 5

test_assess_triggering · Method · 0.72

test_assess_orchestration · Method · 0.72

test_full_analysis · Method · 0.72

test_all_unmeasured_yields_empty_sub_scores · Method · 0.72

test_partial_measurement_omits_only_failed · Method · 0.72