Run evaluation layers on a skill directory and return a result.
(self, skill_dir: Path)
| 69 | # ------------------------------------------------------------------ |
| 70 | |
def evaluate_skill(self, skill_dir: Path) -> PluginEvalResult:
    """Evaluate one skill directory and package the per-layer results.

    The static layer always runs. At ``STANDARD`` depth or deeper the judge
    layer is added; at ``DEEP`` or ``THOROUGH`` depth the Monte Carlo layer
    is awaited right after the judge inside a single ``asyncio.run`` call.

    Args:
        skill_dir: Directory handed to ``parse_skill``.

    Returns:
        A ``PluginEvalResult`` holding the stringified path, a UTC ISO-8601
        timestamp, the active config, every layer result in execution
        order, and the composite built by ``self._build_composite``.

    Note:
        ``asyncio.run`` starts its own event loop, so this method cannot be
        called from code that is already running inside one.
    """
    skill = parse_skill(skill_dir)

    # Layer 1: static analysis is unconditional and seeds the result list.
    layers: list[LayerResult] = [self._static.analyze_skill(skill)]

    depth = self.config.depth

    # Layer 2: judge (standard depth and above).
    if depth in (Depth.STANDARD, Depth.DEEP, Depth.THOROUGH):
        # Imported lazily so shallow runs never load the judge module.
        from plugin_eval.layers.judge import JudgeAnalyzer, JudgeConfig

        judge = JudgeAnalyzer(
            JudgeConfig(
                judges=self.config.judges,
                concurrency=self.config.concurrency,
            )
        )

        if depth not in (Depth.DEEP, Depth.THOROUGH):
            # Judge only: a single coroutine, no Monte Carlo layer.
            layers.append(asyncio.run(judge.analyze_skill(skill)))
        else:
            # Layer 3: Monte Carlo (deep depth and above).
            from plugin_eval.layers.monte_carlo import MonteCarloAnalyzer, MonteCarloConfig

            # A falsy monte_carlo_n (None or 0) falls back to a depth default.
            n_runs = self.config.monte_carlo_n
            if not n_runs:
                n_runs = 100 if depth == Depth.THOROUGH else 50

            mc = MonteCarloAnalyzer(
                MonteCarloConfig(
                    n_runs=n_runs,
                    concurrency=self.config.concurrency,
                )
            )

            async def _judge_then_monte_carlo() -> tuple[LayerResult, LayerResult]:
                # Sequential on purpose: judge finishes before Monte Carlo starts.
                judged = await judge.analyze_skill(skill)
                simulated = await mc.analyze_skill(skill)
                return judged, simulated

            # Both layers share one event loop; results keep judge-first order.
            layers.extend(asyncio.run(_judge_then_monte_carlo()))

    return PluginEvalResult(
        plugin_path=str(skill_dir),
        timestamp=datetime.now(UTC).isoformat(),
        config=self.config,
        layers=layers,
        composite=self._build_composite(layers),
    )
| 128 |