Run evaluation on an entire plugin directory (all skills + agents). Note: Plugin-level evaluation currently only runs Layer 1 (static). Judge and Monte Carlo require per-skill evaluation. The confidence label is always "Estimated" regardless of requested depth.
(self, plugin_dir: Path)
| 127 | ) |
| 128 | |
def evaluate_plugin(self, plugin_dir: Path) -> PluginEvalResult:
    """Evaluate a whole plugin directory (all skills + agents) in one pass.

    Only Layer 1 (static analysis) runs at plugin level; Judge and
    Monte Carlo are available through per-skill evaluation only. For that
    reason the composite's confidence label is always overwritten with the
    QUICK-depth label ("Estimated"), whatever depth was requested.

    Args:
        plugin_dir: Directory of the plugin to analyze.

    Returns:
        A ``PluginEvalResult`` holding the single static layer, the
        composite derived from it, the evaluator config, and a UTC
        ISO-8601 timestamp.
    """
    # Layer 1: static analysis across the entire plugin.
    static_layer = self._static.analyze_plugin(plugin_dir)

    # The plugin-level static result has no per-dimension breakdown, so the
    # overall static score stands in for every static-measurable dimension.
    shared_score = static_layer.score
    per_dimension = dict.fromkeys(STATIC_TO_DIMENSION.values(), shared_score)
    composite = self._assemble_composite(
        per_dimension,
        len(static_layer.anti_patterns),
    )

    # Static-only data: pin the label to the QUICK-depth one regardless of
    # the depth that was requested (judge/MC are per-skill only).
    composite.confidence_label = Depth.QUICK.confidence_label

    layers: list[LayerResult] = [static_layer]
    return PluginEvalResult(
        plugin_path=str(plugin_dir),
        timestamp=datetime.now(UTC).isoformat(),
        config=self.config,
        layers=layers,
        composite=composite,
    )
| 160 | |
| 161 | # ------------------------------------------------------------------ |
| 162 | # Composite construction |