MCPcopy
hub / github.com/wshobson/agents / JudgeAnalyzer

Class JudgeAnalyzer

plugins/plugin-eval/src/plugin_eval/layers/judge.py:146–321  ·  view source on GitHub ↗

Semantic skill evaluation using Claude as a judge.

Source from the content-addressed store, hash-verified

144
145
146class JudgeAnalyzer:
147 """Semantic skill evaluation using Claude as a judge."""
148
def __init__(self, config: JudgeConfig) -> None:
    """Keep a reference to *config* and build the analyzer's semaphore.

    Args:
        config: Judge settings; ``config.concurrency`` is used as the
            initial value of the semaphore stored on ``self._sem``.
    """
    self.config = config
    # NOTE(review): presumably bounds concurrent judge calls; the
    # acquire sites are outside this excerpt — confirm against callers.
    self._sem = asyncio.Semaphore(value=config.concurrency)
152
153 # ------------------------------------------------------------------
154 # Public API
155 # ------------------------------------------------------------------
156
async def analyze_skill(self, skill_or_dir: Path | ParsedSkill) -> LayerResult:
    """Evaluate a skill on all four judge dimensions and bundle the result.

    Args:
        skill_or_dir: An already-parsed skill, or a value that is handed
            to ``parse_skill`` to obtain one.

    Returns:
        A ``LayerResult`` for the ``"judge"`` layer. ``sub_scores`` holds
        only the dimensions whose score is not ``None``; the names of the
        remaining dimensions are listed (sorted) under
        ``metadata["unmeasured"]``, next to the four raw assessment results.
    """
    if isinstance(skill_or_dir, ParsedSkill):
        parsed = skill_or_dir
    else:
        parsed = parse_skill(skill_or_dir)

    # The four assessments are awaited together; gather returns their
    # results in the same order the coroutines are listed here.
    reports = await asyncio.gather(
        self.assess_triggering(parsed),
        self.assess_orchestration(parsed),
        self.assess_output_quality(parsed),
        self.assess_scope(parsed),
    )
    trig_report, orch_report, quality_report, scope_report = reports

    candidates: dict[str, float | None] = {
        "triggering_accuracy": _measured_score(trig_report, "f1"),
        "orchestration_fitness": _measured_score(orch_report, "score"),
        "output_quality": _measured_score(quality_report, "score"),
        "scope_calibration": _measured_score(scope_report, "score"),
    }

    # Split the dimensions into those with a score and those without.
    measured: dict[str, float] = {}
    missing: list[str] = []
    for dimension, value in candidates.items():
        if value is None:
            missing.append(dimension)
        else:
            measured[dimension] = value
    missing.sort()

    # The layer-level score is for display only: the composite engine
    # blends the per-dimension sub_scores and excludes omitted keys, so
    # a plain mean over the measured dimensions is used here.
    if measured:
        layer_score = sum(measured.values()) / len(measured)
    else:
        layer_score = 0.0

    return LayerResult(
        layer="judge",
        score=layer_score,
        sub_scores=measured,
        metadata={
            "triggering": trig_report,
            "orchestration": orch_report,
            "output_quality": quality_report,
            "scope": scope_report,
            "unmeasured": missing,
        },
    )
194
195 # ------------------------------------------------------------------
196 # Individual assessments
197 # ------------------------------------------------------------------
198
199 async def assess_triggering(self, skill: Path | ParsedSkill) -> dict:
200 """Generate 10 synthetic prompts and classify triggering accuracy via Haiku."""
201 if isinstance(skill, Path):
202 skill = parse_skill(skill)
203 model = _resolve_model("haiku")

Calls

no outgoing calls