MCPcopy
hub / github.com/wshobson/agents / evaluate_skill

Method evaluate_skill

plugins/plugin-eval/src/plugin_eval/engine.py:71–127  ·  view source on GitHub ↗

Run evaluation layers on a skill directory and return a result.

(self, skill_dir: Path)

Source from the content-addressed store, hash-verified

69 # ------------------------------------------------------------------
70
def evaluate_skill(self, skill_dir: Path) -> PluginEvalResult:
    """Evaluate the skill found in ``skill_dir`` layer by layer.

    The static layer always runs. When the configured depth is STANDARD,
    DEEP or THOROUGH the judge layer is added; for DEEP and THOROUGH the
    Monte Carlo layer is added after it. Layer results are collected in
    that order and handed to ``self._build_composite``.

    Args:
        skill_dir: Skill directory, passed to ``parse_skill``.

    Returns:
        A ``PluginEvalResult`` carrying the stringified path, a timestamp
        from ``datetime.now(UTC).isoformat()``, the active config, the
        collected layer results and the composite built from them.
    """
    parsed = parse_skill(skill_dir)

    # Layer 1: static analysis is unconditional.
    results: list[LayerResult] = [self._static.analyze_skill(parsed)]

    settings = self.config
    depth = settings.depth

    # Layer 2: the judge is enabled from standard depth upwards.
    if depth in (Depth.STANDARD, Depth.DEEP, Depth.THOROUGH):
        # Imported lazily so the judge module is only loaded when needed.
        from plugin_eval.layers.judge import JudgeAnalyzer, JudgeConfig

        judge = JudgeAnalyzer(
            JudgeConfig(
                judges=settings.judges,
                concurrency=settings.concurrency,
            )
        )

        if depth not in (Depth.DEEP, Depth.THOROUGH):
            # Standard depth: the judge is the only async layer to run.
            results.append(asyncio.run(judge.analyze_skill(parsed)))
        else:
            # Layer 3: Monte Carlo (deep+ depth), driven from the same
            # event loop as the judge.
            from plugin_eval.layers.monte_carlo import MonteCarloAnalyzer, MonteCarloConfig

            # An explicit (truthy) monte_carlo_n wins; otherwise fall back
            # to 100 runs for THOROUGH and 50 for DEEP.
            run_count = settings.monte_carlo_n
            if not run_count:
                run_count = 100 if depth == Depth.THOROUGH else 50

            monte_carlo = MonteCarloAnalyzer(
                MonteCarloConfig(
                    n_runs=run_count,
                    concurrency=settings.concurrency,
                )
            )

            async def _judge_then_monte_carlo() -> tuple[LayerResult, LayerResult]:
                # Awaited one after the other: judge first, then Monte Carlo.
                judged = await judge.analyze_skill(parsed)
                simulated = await monte_carlo.analyze_skill(parsed)
                return judged, simulated

            results.extend(asyncio.run(_judge_then_monte_carlo()))

    composite = self._build_composite(results)

    return PluginEvalResult(
        plugin_path=str(skill_dir),
        timestamp=datetime.now(UTC).isoformat(),
        config=self.config,
        layers=results,
        composite=composite,
    )
128

Calls 8

analyze_skill · Method · 0.95
_build_composite · Method · 0.95
parse_skill · Function · 0.90
JudgeConfig · Class · 0.90
JudgeAnalyzer · Class · 0.90
MonteCarloConfig · Class · 0.90
MonteCarloAnalyzer · Class · 0.90
PluginEvalResult · Class · 0.90