hub / github.com/wshobson/agents / _run_score

Function _run_score

plugins/plugin-eval/src/plugin_eval/cli.py:32–98 · view source on GitHub ↗

Core scoring logic; returns exit code.

(
    path: Path,
    depth: Depth,
    output: str,
    verbose: bool,
    concurrency: int,
    threshold: float | None,
)

Source from the content-addressed store, hash-verified

30
31
32	def _run_score(
33	path: Path,
34	depth: Depth,
35	output: str,
36	verbose: bool,
37	concurrency: int,
38	threshold: float | None,
39	) -> int:
40	"""Core scoring logic; returns exit code."""
41	if not path.exists():
42	console.print(f"[red]Error: Path does not exist: {path}[/red]")
43	raise typer.Exit(code=2)
44
45	config = EvalConfig(
46	depth=depth,
47	output_format=output,
48	verbose=verbose,
49	concurrency=concurrency,
50	)
51	engine = EvalEngine(config)
52
53	target = _detect_target(path)
54	if target == "skill":
55	result = engine.evaluate_skill(path)
56	elif target == "plugin":
57	if depth != Depth.QUICK:
58	stderr_console.print(
59	f"[yellow]warning:[/yellow] plugin-level evaluation only runs the "
60	f"static layer; judge and Monte Carlo layers require per-skill "
61	f"evaluation. Requested depth [bold]{depth.value}[/bold] will be "
62	f"served from the static layer only — confidence label will be "
63	f"[bold]Estimated[/bold] regardless. To use the deeper layers, "
64	f"point at an individual skill directory."
65	)
66	result = engine.evaluate_plugin(path)
67	else:
68	# Attempt skill evaluation as fallback
69	result = engine.evaluate_skill(path)
70
71	reporter = Reporter()
72	if output == "json":
73	typer.echo(reporter.to_json(result))
74	elif output == "html":
75	typer.echo(reporter.to_html(result))
76	else:
77	# Default: markdown
78	typer.echo(reporter.to_markdown(result))
79
80	judge_layer = next((lr for lr in result.layers if lr.layer == "judge"), None)
81	if judge_layer is not None:
82	unmeasured = judge_layer.metadata.get("unmeasured") or []
83	if unmeasured:
84	stderr_console.print(
85	f"[yellow]warning:[/yellow] LLM judge could not measure "
86	f"{', '.join(unmeasured)}; composite computed from the remaining "
87	f"layers. Check that claude-agent-sdk is installed and a model is "
88	f"configured (run with --verbose for details)."
89	)

Callers 2

score · Function · 0.85

certify · Function · 0.85

Calls 9

evaluate_skill · Method · 0.95

evaluate_plugin · Method · 0.95

to_json · Method · 0.95

to_html · Method · 0.95

to_markdown · Method · 0.95

EvalConfig · Class · 0.90

EvalEngine · Class · 0.90

Reporter · Class · 0.90

_detect_target · Function · 0.85

Tested by

no test coverage detected