hub / github.com/wshobson/agents / evaluate_skill

Method evaluate_skill

plugins/plugin-eval/src/plugin_eval/engine.py:71–127 · view source on GitHub ↗

Run evaluation layers on a skill directory and return a result.

(self, skill_dir: Path)

Source from the content-addressed store, hash-verified

69	# ------------------------------------------------------------------
70
def evaluate_skill(self, skill_dir: Path) -> PluginEvalResult:
    """Evaluate the skill at ``skill_dir`` and package the layer results.

    The static layer always runs. The judge layer is added when the
    configured depth is STANDARD, DEEP or THOROUGH, and the Monte Carlo
    layer is added after it for DEEP and THOROUGH.

    Args:
        skill_dir: Directory of the skill; passed to ``parse_skill`` and
            recorded (as a string) in the result's ``plugin_path``.

    Returns:
        A ``PluginEvalResult`` carrying the layer results in execution
        order (static, judge, Monte Carlo), the composite built from them,
        the active config, and a UTC ISO-format timestamp.

    Note:
        The judge and Monte Carlo layers are driven through
        ``asyncio.run``, which cannot be called while another event loop
        is already running in the same thread.
    """
    parsed = parse_skill(skill_dir)

    # Layer 1: static analysis is unconditional, so it seeds the list.
    collected: list[LayerResult] = [self._static.analyze_skill(parsed)]

    # Layer 2: judge, for every depth from STANDARD upwards.
    if self.config.depth in (Depth.STANDARD, Depth.DEEP, Depth.THOROUGH):
        # Imported inside the branch so the judge module is only loaded
        # when this layer is actually requested.
        from plugin_eval.layers.judge import JudgeAnalyzer, JudgeConfig

        judge_layer = JudgeAnalyzer(
            JudgeConfig(
                judges=self.config.judges,
                concurrency=self.config.concurrency,
            )
        )

        if self.config.depth not in (Depth.DEEP, Depth.THOROUGH):
            # STANDARD depth: the judge is the only LLM-backed layer.
            collected.append(asyncio.run(judge_layer.analyze_skill(parsed)))
        else:
            # Layer 3: Monte Carlo, for DEEP and THOROUGH.
            from plugin_eval.layers.monte_carlo import MonteCarloAnalyzer, MonteCarloConfig

            # A falsy configured count (None or 0) falls back to a
            # depth-dependent default.
            run_count = self.config.monte_carlo_n
            if not run_count:
                run_count = 100 if self.config.depth == Depth.THOROUGH else 50

            monte_carlo_layer = MonteCarloAnalyzer(
                MonteCarloConfig(
                    n_runs=run_count,
                    concurrency=self.config.concurrency,
                )
            )

            async def _judge_then_monte_carlo() -> tuple[LayerResult, LayerResult]:
                # Both layers share one event loop; the judge is awaited
                # to completion before Monte Carlo starts.
                judged = await judge_layer.analyze_skill(parsed)
                simulated = await monte_carlo_layer.analyze_skill(parsed)
                return judged, simulated

            collected.extend(asyncio.run(_judge_then_monte_carlo()))

    composite_score = self._build_composite(collected)

    return PluginEvalResult(
        plugin_path=str(skill_dir),
        timestamp=datetime.now(UTC).isoformat(),
        config=self.config,
        layers=collected,
        composite=composite_score,
    )
128

Callers 10

_run_score · Function · 0.95

compare · Function · 0.95

test_json_output · Method · 0.95

test_markdown_output · Method · 0.95

test_effective_depth_matches_layers_run · Method · 0.95

test_markdown_shows_no_warning_when_depth_was_honored · Method · 0.95

test_quick_eval_skill · Method · 0.95

test_composite_score_within_bounds · Method · 0.95

test_score_real_skill_quick · Method · 0.95

test_score_agent_teams_quick · Method · 0.95

Calls 8

analyze_skill · Method · 0.95

_build_composite · Method · 0.95

parse_skill · Function · 0.90

JudgeConfig · Class · 0.90

JudgeAnalyzer · Class · 0.90

MonteCarloConfig · Class · 0.90

MonteCarloAnalyzer · Class · 0.90

PluginEvalResult · Class · 0.90

Tested by 8

test_json_output · Method · 0.76

test_markdown_output · Method · 0.76

test_effective_depth_matches_layers_run · Method · 0.76

test_markdown_shows_no_warning_when_depth_was_honored · Method · 0.76

test_quick_eval_skill · Method · 0.76

test_composite_score_within_bounds · Method · 0.76

test_score_real_skill_quick · Method · 0.76

test_score_agent_teams_quick · Method · 0.76