| 13 | |
| 14 | |
def _build_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for the ``evaluate`` program.

    Options are registered in a fixed order (which is also the order they
    appear in ``--help`` output):

    * ``--trajectories`` (required, ``Path``)
    * ``--scenarios`` (required, one or more ``Path`` values)
    * ``--reports-dir`` (``Path``, defaults to ``Path("reports")``)
    * ``--scorer-default`` (string, defaults to ``"llm_judge"``)
    * ``--judge-model`` (string, defaults to ``None``)
    * ``-v`` / ``--verbose`` (boolean flag)

    Returns:
        A configured ``argparse.ArgumentParser``; no arguments are parsed here.
    """
    parser = argparse.ArgumentParser(
        prog="evaluate",
        description=(
            "Score saved agent trajectories against scenario files and "
            "emit a JSON report."
        ),
    )

    # Each entry is (option strings, keyword arguments for add_argument).
    # Keeping the options in one table makes the CLI surface easy to scan.
    option_table = (
        (
            ("--trajectories",),
            {
                "type": Path,
                "required": True,
                "help": (
                    "Directory of {run_id}.json trajectory files "
                    "(or a single file)."
                ),
            },
        ),
        (
            ("--scenarios",),
            {
                "type": Path,
                "nargs": "+",
                "required": True,
                "help": "One or more scenario JSON / JSONL files.",
            },
        ),
        (
            ("--reports-dir",),
            {
                "type": Path,
                "default": Path("reports"),
                "help": (
                    "Directory to write per-run JSON reports (one file per run, "
                    "named '<run_id>.json'), plus '_aggregate.json' for the rollup. "
                    "Default: reports/."
                ),
            },
        ),
        (
            ("--scorer-default",),
            {
                "dest": "scorer_default",
                "default": "llm_judge",
                "help": (
                    "Scorer name when scenario.scoring_method is unset. "
                    "Default: llm_judge."
                ),
            },
        ),
        (
            ("--judge-model",),
            {
                "default": None,
                "help": (
                    "Model id for the LLM-As-Judge scorer (e.g. "
                    "litellm_proxy/anthropic/claude-opus-4-5). "
                    "Required when any scenario routes to llm_judge."
                ),
            },
        ),
        (
            ("-v", "--verbose"),
            {
                "action": "store_true",
                "help": "Enable INFO-level logging.",
            },
        ),
    )

    # Register in table order so the generated help lists options in the
    # same sequence as the declarations above.
    for option_strings, settings in option_table:
        parser.add_argument(*option_strings, **settings)

    return parser
| 67 | |
| 68 | |
| 69 | def _maybe_install_judge(judge_model: str | None) -> None: |