| 33 | # ------------------------------------------------------------------ |
| 34 | |
| 35 | def to_markdown(self, result: PluginEvalResult) -> str: |
| 36 | lines: list[str] = [] |
| 37 | |
| 38 | lines.append("# PluginEval Report") |
| 39 | lines.append("") |
| 40 | lines.append(f"**Path:** `{result.plugin_path}`") |
| 41 | lines.append(f"**Timestamp:** {result.timestamp}") |
| 42 | requested = Depth(result.config.depth) |
| 43 | effective = _effective_depth(result) |
| 44 | if effective is requested: |
| 45 | lines.append(f"**Depth:** {requested.value}") |
| 46 | else: |
| 47 | lines.append( |
| 48 | f"**Depth:** {requested.value} (requested) → {effective.value} (effective)" |
| 49 | ) |
| 50 | lines.append("") |
| 51 | |
| 52 | if effective is not requested: |
| 53 | lines.append( |
| 54 | "> **Note:** Requested depth `" |
| 55 | f"{requested.value}` was downgraded to `{effective.value}` " |
| 56 | "because plugin-level evaluation only runs the static layer. " |
| 57 | "Judge and Monte Carlo layers require per-skill evaluation — " |
| 58 | "point at an individual skill directory to use the deeper " |
| 59 | "layers. Composite score and confidence reflect the layers " |
| 60 | "actually run." |
| 61 | ) |
| 62 | lines.append("") |
| 63 | |
| 64 | # Overall Score |
| 65 | lines.append("## Overall Score") |
| 66 | lines.append("") |
| 67 | if result.composite: |
| 68 | c = result.composite |
| 69 | score_str = f"{c.score:.1f}/100" |
| 70 | badge_str = c.badge.value.replace("_", " ").title() |
| 71 | lines.append("| Metric | Value |") |
| 72 | lines.append("|--------|-------|") |
| 73 | lines.append(f"| Score | **{score_str}** |") |
| 74 | lines.append(f"| Confidence | {c.confidence_label} |") |
| 75 | lines.append(f"| Badge | {badge_str} |") |
| 76 | if c.ci_lower is not None and c.ci_upper is not None: |
| 77 | lines.append(f"| 95% CI | [{c.ci_lower:.1f}, {c.ci_upper:.1f}] |") |
| 78 | if c.anti_pattern_penalty < 1.0: |
| 79 | penalty_pct = (1.0 - c.anti_pattern_penalty) * 100 |
| 80 | lines.append(f"| Anti-Pattern Penalty | -{penalty_pct:.0f}% |") |
| 81 | else: |
| 82 | lines.append("_No composite score available._") |
| 83 | lines.append("") |
| 84 | |
| 85 | # Elo Rating (if present) |
| 86 | if result.elo: |
| 87 | elo = result.elo |
| 88 | lines.append("## Elo Rating") |
| 89 | lines.append("") |
| 90 | lines.append("| Metric | Value |") |
| 91 | lines.append("|--------|-------|") |
| 92 | lines.append(f"| Rating | {elo.rating:.0f} |") |