()
| 393 | } |
| 394 | |
| 395 | async function main() { |
| 396 | const args = parseArgs(); |
| 397 | console.log("Args:", args); |
| 398 | |
| 399 | await mkdir(dirname(args.output), { recursive: true }); |
| 400 | |
| 401 | const candidates = await discoverCandidates(args); |
| 402 | |
| 403 | console.log(`Total unique candidates: ${candidates.size}`); |
| 404 | |
| 405 | const resumeIds = args.resume ? await loadResumeIds(args.output) : new Set(); |
| 406 | if (args.resume) { |
| 407 | console.log(`Resume: skipping ${resumeIds.size} already-extracted repos.`); |
| 408 | } |
| 409 | |
| 410 | const outStream = createWriteStream(args.output, { |
| 411 | flags: args.resume ? "a" : "w", |
| 412 | }); |
| 413 | const failedStream = createWriteStream(`${args.output}.failed.jsonl`, { |
| 414 | flags: args.resume ? "a" : "w", |
| 415 | }); |
| 416 | |
| 417 | const summary = { |
| 418 | total_candidates: candidates.size, |
| 419 | after_filter: 0, |
| 420 | parsed_ok: 0, |
| 421 | parse_failed: 0, |
| 422 | skipped_fork: 0, |
| 423 | skipped_archived: 0, |
| 424 | skipped_unreadable: 0, |
| 425 | skipped_low_stars: 0, |
| 426 | skipped_resume: 0, |
| 427 | by_source: {} as Record<string, number>, |
| 428 | by_component_type: {} as Record<string, number>, |
| 429 | }; |
| 430 | |
| 431 | let processed = 0; |
| 432 | for (const [key, candidate] of candidates) { |
| 433 | if (processed >= args.limit) break; |
| 434 | processed++; |
| 435 | |
| 436 | const url = `https://github.com/${candidate.owner}/${candidate.repo}`; |
| 437 | const prefix = `[${processed}/${Math.min(candidates.size, args.limit)}] ${key}`; |
| 438 | |
| 439 | let meta: GitHubRepoMeta | null; |
| 440 | try { |
| 441 | meta = await fetchGitHubRepoMeta(candidate.owner, candidate.repo); |
| 442 | } catch (err) { |
| 443 | console.warn( |
| 444 | `${prefix} repo meta fetch failed: ${err instanceof Error ? err.message : err}`, |
| 445 | ); |
| 446 | summary.skipped_unreadable++; |
| 447 | failedStream.write( |
| 448 | `${JSON.stringify({ |
| 449 | url, |
| 450 | source: candidate.source, |
| 451 | reason: "meta_fetch_failed", |
| 452 | })}\n`, |
no test coverage detected