Semantic skill evaluation using Claude as a judge.
| 144 | |
| 145 | |
| 146 | class JudgeAnalyzer: |
| 147 | """Semantic skill evaluation using Claude as a judge.""" |
| 148 | |
def __init__(self, config: JudgeConfig) -> None:
    """Keep the judge configuration and build the shared semaphore.

    Args:
        config: Judge settings. Its ``concurrency`` attribute is used as
            the initial value of the ``asyncio.Semaphore`` stored on
            ``self._sem``.
    """
    self.config = config
    max_parallel = config.concurrency
    self._sem = asyncio.Semaphore(max_parallel)
| 152 | |
| 153 | # ------------------------------------------------------------------ |
| 154 | # Public API |
| 155 | # ------------------------------------------------------------------ |
| 156 | |
async def analyze_skill(self, skill_or_dir: Path | ParsedSkill) -> LayerResult:
    """Run the four judge assessments concurrently and package the result.

    Args:
        skill_or_dir: An already parsed skill, or a path that is handed to
            ``parse_skill`` first.

    Returns:
        A ``LayerResult`` for the ``"judge"`` layer. ``sub_scores`` holds
        only the dimensions for which ``_measured_score`` returned a value;
        dimensions that came back as ``None`` are listed (sorted) under
        ``metadata["unmeasured"]``. ``metadata`` also carries the raw
        result of each assessment.
    """
    if isinstance(skill_or_dir, ParsedSkill):
        skill = skill_or_dir
    else:
        skill = parse_skill(skill_or_dir)

    # asyncio.gather returns results in the order the awaitables are passed.
    triggering, orchestration, output_quality, scope = await asyncio.gather(
        self.assess_triggering(skill),
        self.assess_orchestration(skill),
        self.assess_output_quality(skill),
        self.assess_scope(skill),
    )

    raw_scores: dict[str, float | None] = {
        "triggering_accuracy": _measured_score(triggering, "f1"),
        "orchestration_fitness": _measured_score(orchestration, "score"),
        "output_quality": _measured_score(output_quality, "score"),
        "scope_calibration": _measured_score(scope, "score"),
    }

    # Split the dimensions into measured (kept as sub-scores) and unmeasured.
    sub_scores: dict[str, float] = {}
    unmeasured: list[str] = []
    for dimension, value in raw_scores.items():
        if value is None:
            unmeasured.append(dimension)
        else:
            sub_scores[dimension] = value
    unmeasured.sort()

    # The layer score is display-only: the composite engine blends the
    # per-dimension sub_scores (omitted keys are excluded). Report the mean
    # of the measured dimensions, or 0.0 when nothing was measured.
    if sub_scores:
        score = sum(sub_scores.values()) / len(sub_scores)
    else:
        score = 0.0

    return LayerResult(
        layer="judge",
        score=score,
        sub_scores=sub_scores,
        metadata={
            "triggering": triggering,
            "orchestration": orchestration,
            "output_quality": output_quality,
            "scope": scope,
            "unmeasured": unmeasured,
        },
    )
| 194 | |
| 195 | # ------------------------------------------------------------------ |
| 196 | # Individual assessments |
| 197 | # ------------------------------------------------------------------ |
| 198 | |
| 199 | async def assess_triggering(self, skill: Path | ParsedSkill) -> dict: |
| 200 | """Generate 10 synthetic prompts and classify triggering accuracy via Haiku.""" |
| 201 | if isinstance(skill, Path): |
| 202 | skill = parse_skill(skill) |
| 203 | model = _resolve_model("haiku") |
no outgoing calls