(argv: Optional[Sequence[str]] = None)
| 260 | |
| 261 | |
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Scrape every configured repository and write the snippets as JSON Lines.

    Arguments are parsed with ``parse_args`` and the repository list is loaded
    with ``load_config``. When a config mapping is returned, its values replace
    the corresponding parsed options; ``languages`` is only taken from the
    config when it was not set on the parsed arguments. Repositories passed via
    ``args.repo`` are appended to the configured ones.

    Each record yielded by ``scrape_repo`` is written as one JSON line to
    ``args.out``. A one-line JSON summary (snippet count and output path) is
    printed to stdout; progress and errors go to stderr.

    Args:
        argv: Argument list forwarded to ``parse_args``.

    Returns:
        2 when no repositories are configured, otherwise 0.
    """
    args = parse_args(argv)
    cfg, specs = load_config(args.config, args.preset)
    if cfg:
        # Config values, when present, replace the parsed argument values.
        args.cache_dir = Path(str(cfg.get("cacheDir", args.cache_dir)))
        args.max_files_per_repo = int(cfg.get("maxFilesPerRepo", args.max_files_per_repo))
        args.max_snippets_per_repo = int(cfg.get("maxSnippetsPerRepo", args.max_snippets_per_repo))
        args.max_file_bytes = int(cfg.get("maxFileBytes", args.max_file_bytes))
        args.chunk_lines = int(cfg.get("chunkLines", args.chunk_lines))
        args.overlap = int(cfg.get("overlap", args.overlap))
        args.min_chars = int(cfg.get("minChars", args.min_chars))
        languages = cfg.get("languages")
        # Unlike the options above, an explicitly supplied languages value wins.
        if languages and not args.languages:
            if isinstance(languages, list):
                # str() each entry so a non-string item cannot break the join.
                args.languages = ",".join(str(item) for item in languages)
            else:
                args.languages = str(languages)
    for repo in args.repo:
        specs.append(RepoSpec(url=repo, tags=[]))
    if not specs:
        print("No repositories configured. Use --preset starter/broad or add repos to repos.json.", file=sys.stderr)
        return 2

    args.out.parent.mkdir(parents=True, exist_ok=True)
    total = 0
    with args.out.open("w", encoding="utf-8") as fh:
        for spec in specs:
            print(f"scraping {spec.url}", file=sys.stderr)
            # Initialised before the try so it is defined on every path.
            count = 0
            try:
                repo_root = ensure_repo(spec, args.cache_dir)
                for rec in scrape_repo(spec, repo_root, args):
                    fh.write(json.dumps(rec, ensure_ascii=False) + "\n")
                    count += 1
            except (subprocess.CalledProcessError, OSError) as exc:
                # Best effort: one failing repository must not abort the run.
                print(f" skipped: {exc}", file=sys.stderr)
            else:
                print(f" snippets: {count}", file=sys.stderr)
            # Records written before a mid-repo failure are already in the
            # output file, so they count towards the total on both paths.
            total += count
    print(json.dumps({"snippets": total, "out": str(args.out)}))
    return 0
| 298 | |
| 299 | |
| 300 | if __name__ == "__main__": |
no test coverage detected