()
| 242 | |
| 243 | |
| 244 | def main(): |
| 245 | parser = argparse.ArgumentParser(description="Run eval + improve loop") |
| 246 | parser.add_argument("--eval-set", required=True, help="Path to eval set JSON file") |
| 247 | parser.add_argument("--skill-path", required=True, help="Path to skill directory") |
| 248 | parser.add_argument("--description", default=None, help="Override starting description") |
| 249 | parser.add_argument("--num-workers", type=int, default=10, help="Number of parallel workers") |
| 250 | parser.add_argument("--timeout", type=int, default=30, help="Timeout per query in seconds") |
| 251 | parser.add_argument("--max-iterations", type=int, default=5, help="Max improvement iterations") |
| 252 | parser.add_argument("--runs-per-query", type=int, default=3, help="Number of runs per query") |
| 253 | parser.add_argument("--trigger-threshold", type=float, default=0.5, help="Trigger rate threshold") |
| 254 | parser.add_argument("--holdout", type=float, default=0.4, help="Fraction of eval set to hold out for testing (0 to disable)") |
| 255 | parser.add_argument("--model", required=True, help="Model for improvement") |
| 256 | parser.add_argument("--verbose", action="store_true", help="Print progress to stderr") |
| 257 | parser.add_argument("--report", default="auto", help="Generate HTML report at this path (default: 'auto' for temp file, 'none' to disable)") |
| 258 | parser.add_argument("--results-dir", default=None, help="Save all outputs (results.json, report.html, log.txt) to a timestamped subdirectory here") |
| 259 | args = parser.parse_args() |
| 260 | |
| 261 | eval_set = json.loads(Path(args.eval_set).read_text()) |
| 262 | skill_path = Path(args.skill_path) |
| 263 | |
| 264 | if not (skill_path / "SKILL.md").exists(): |
| 265 | print(f"Error: No SKILL.md found at {skill_path}", file=sys.stderr) |
| 266 | sys.exit(1) |
| 267 | |
| 268 | name, _, _ = parse_skill_md(skill_path) |
| 269 | |
| 270 | # Set up live report path |
| 271 | if args.report != "none": |
| 272 | if args.report == "auto": |
| 273 | timestamp = time.strftime("%Y%m%d_%H%M%S") |
| 274 | live_report_path = Path(tempfile.gettempdir()) / f"skill_description_report_{skill_path.name}_{timestamp}.html" |
| 275 | else: |
| 276 | live_report_path = Path(args.report) |
| 277 | # Open the report immediately so the user can watch |
| 278 | live_report_path.write_text("<html><body><h1>Starting optimization loop...</h1><meta http-equiv='refresh' content='5'></body></html>") |
| 279 | webbrowser.open(str(live_report_path)) |
| 280 | else: |
| 281 | live_report_path = None |
| 282 | |
| 283 | # Determine output directory (create before run_loop so logs can be written) |
| 284 | if args.results_dir: |
| 285 | timestamp = time.strftime("%Y-%m-%d_%H%M%S") |
| 286 | results_dir = Path(args.results_dir) / timestamp |
| 287 | results_dir.mkdir(parents=True, exist_ok=True) |
| 288 | else: |
| 289 | results_dir = None |
| 290 | |
| 291 | log_dir = results_dir / "logs" if results_dir else None |
| 292 | |
| 293 | output = run_loop( |
| 294 | eval_set=eval_set, |
| 295 | skill_path=skill_path, |
| 296 | description_override=args.description, |
| 297 | num_workers=args.num_workers, |
| 298 | timeout=args.timeout, |
| 299 | max_iterations=args.max_iterations, |
| 300 | runs_per_query=args.runs_per_query, |
| 301 | trigger_threshold=args.trigger_threshold, |
no test coverage detected