| 243 | |
| 244 | |
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Build the scraper's command-line parser and parse *argv* with it.

    Args:
        argv: Argument strings to parse. ``None`` is handed to argparse
            unchanged, which then reads the process command line.

    Returns:
        The parsed namespace. ``--out`` is the only required option;
        ``--repo`` may be repeated and its values are collected in a list.
    """
    parser = argparse.ArgumentParser(
        description="Scrape curated GitHub/local repos into SmallCode RAG JSONL snippets.",
    )

    # Input / output locations.
    parser.add_argument(
        "--config",
        type=Path,
        default=None,
        help="Path to .smallcode/rag/repos.json",
    )
    parser.add_argument(
        "--cache-dir",
        type=Path,
        default=Path(".smallcode/rag/repos"),
    )
    parser.add_argument(
        "--out",
        type=Path,
        required=True,
        help="Output JSONL file",
    )

    # Repository and language selection.
    parser.add_argument(
        "--preset",
        default=None,
        help="Curated preset: starter, broad, or none",
    )
    parser.add_argument(
        "--repo",
        action="append",
        default=[],
        help="Extra Git URL or local path to scrape",
    )
    parser.add_argument(
        "--languages",
        default="",
        help="Optional comma-separated language allowlist",
    )

    # Integer-valued options share one shape (int type, no help text), so
    # they are registered from a table. Tuple order is registration order,
    # which is also the order argparse lists them in --help.
    integer_options = (
        ("--max-files-per-repo", 1000),
        ("--max-snippets-per-repo", 4000),
        ("--max-file-bytes", 250_000),
        ("--chunk-lines", 80),
        ("--overlap", 20),
        ("--min-chars", 120),
    )
    for flag, fallback in integer_options:
        parser.add_argument(flag, type=int, default=fallback)

    return parser.parse_args(argv)
| 260 | |
| 261 | |
| 262 | def main(argv: Optional[Sequence[str]] = None) -> int: |