()
| 257 | |
| 258 | |
def main():
    """Command-line entry point: run a trigger evaluation for a skill description.

    Parses the command-line flags, loads the eval set JSON file, checks that
    the skill directory contains a ``SKILL.md``, and passes everything to
    ``run_eval``. The evaluation output is printed to stdout as indented JSON.
    With ``--verbose``, the description under test and a per-query
    PASS/FAIL summary are also printed to stderr.

    Exits with status 1 if ``SKILL.md`` is not found in the skill directory.
    """
    parser = argparse.ArgumentParser(description="Run trigger evaluation for a skill description")
    parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file")
    parser.add_argument("--skill-path", required=True, help="Path to skill directory")
    parser.add_argument("--description", default=None, help="Override description to test")
    parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers")
    parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds")
    parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query")
    parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold")
    parser.add_argument("--model", default=None, help="Model to use for claude -p (default: user's configured model)")
    parser.add_argument("--verbose", action="store_true", help="Print progress to stderr")
    args = parser.parse_args()

    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's locale encoding, which may not be UTF-8.
    eval_set = json.loads(Path(args.eval_set).read_text(encoding="utf-8"))
    skill_path = Path(args.skill_path)

    if not (skill_path / "SKILL.md").exists():
        print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr)
        sys.exit(1)

    # The third value returned by parse_skill_md is not needed here.
    name, original_description, _ = parse_skill_md(skill_path)
    # A missing (or empty) --description falls back to the description
    # parsed from the skill itself.
    description = args.description or original_description
    project_root = find_project_root()

    if args.verbose:
        print(f"Evaluating: {description}", file=sys.stderr)

    output = run_eval(
        eval_set=eval_set,
        skill_name=name,
        description=description,
        num_workers=args.num_workers,
        timeout=args.timeout,
        project_root=project_root,
        runs_per_query=args.runs_per_query,
        trigger_threshold=args.trigger_threshold,
        model=args.model,
    )

    if args.verbose:
        summary = output["summary"]
        print(f"Results: {summary['passed']}/{summary['total']} passed", file=sys.stderr)
        for r in output["results"]:
            status = "PASS" if r["pass"] else "FAIL"
            rate_str = f"{r['triggers']}/{r['runs']}"
            # Queries are truncated to 70 characters to keep each line short.
            print(f" [{status}] rate={rate_str} expected={r['should_trigger']}: {r['query'][:70]}", file=sys.stderr)

    # The JSON result goes to stdout; all diagnostics above go to stderr.
    print(json.dumps(output, indent=2))
| 307 | |
| 308 | |
| 309 | if __name__ == "__main__": |
no test coverage detected