MCPcopy
hub / github.com/colbymchenry/codegraph / collectGitFiles

Function collectGitFiles

src/extraction/index.ts:661–758  ·  view source on GitHub ↗

Collect git-visible files (tracked + untracked, .gitignore-respected) from the git repository rooted at `repoDir`, adding each to `files` with `prefix` prepended so paths stay relative to the original scan root. Recurses into embedded git repositories — nested repos that are NOT submod…

(repoDir: string, prefix: string, files: Set<string>, embeddedRoots?: Set<string>, includeIgnored: Ignore | null = null)

Source from the content-addressed store, hash-verified

659 * parent's own gitignore rules.
660 */
661function collectGitFiles(repoDir: string, prefix: string, files: Set<string>, embeddedRoots?: Set<string>, includeIgnored: Ignore | null = null): void {
662 const gitOpts = { cwd: repoDir, encoding: 'utf-8' as const, timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'], windowsHide: true };
663
664 // Tracked files. --recurse-submodules pulls in files from active submodules,
665 // which the index would otherwise represent only as a commit pointer.
666 // Without this, monorepos using submodules index 0 files. (See issue #147.)
667 // Note: --recurse-submodules only supports -c/--cached and --stage modes — it
668 // can't be combined with -o, so untracked files are gathered separately below.
669 //
670 // We use --stage (-s) rather than -c so each entry carries its file mode. That
671 // lets us spot gitlink entries (mode 160000) that --recurse-submodules did NOT
672 // expand: a nested repo `git add`ed without a `.gitmodules` entry, or a
673 // submodule that isn't active/initialized in this checkout. Such a gitlink
674 // falls through every pass — it's tracked, so the untracked `-o` listing below
675 // never reports it, and --recurse-submodules only expands ACTIVE submodules —
676 // so its source would be silently skipped, leaving only the super-repo's own
677 // files indexed. We collect those gitlinks here and recurse into them below.
678 // (An active submodule is expanded inline by --recurse-submodules and so never
679 // surfaces as a 160000 entry — only the unhandled gitlinks do.) (#1031, #1033)
680 //
681 // -z gives NUL-separated, unquoted output so non-ASCII (e.g. CJK) paths
682 // survive verbatim. Without it git octal-escapes and double-quotes such paths
683 // (the core.quotepath default), and the quoted form never matches a real file
684 // on disk → those files are silently dropped from the index. (#541) With -s the
685 // path follows a TAB after the `<mode> <object> <stage>` prefix.
686 const gitlinkRels: string[] = [];
687 const tracked = execFileSync('git', ['ls-files', '-z', '-s', '--recurse-submodules'], gitOpts);
688 for (const entry of tracked.split('\0')) {
689 if (!entry) continue;
690 const tab = entry.indexOf('\t');
691 if (tab === -1) continue; // --stage always emits "<mode> <object> <stage>\t<path>"
692 const rel = entry.slice(tab + 1);
693 if (entry.slice(0, 6) === '160000') {
694 gitlinkRels.push(rel); // an unexpanded gitlink — recursed into below, not a source file itself
695 continue;
696 }
697 files.add(normalizePath(prefix + rel));
698 }
699
700 // Untracked files (submodules manage their own untracked state). Embedded git
701 // repos surface here as a single "subdir/" entry that git refuses to descend
702 // into — recurse into those as their own repos so their source gets indexed.
703 const untracked = execFileSync('git', ['ls-files', '-z', '-o', '--exclude-standard'], gitOpts);
704 for (const rel of untracked.split('\0')) {
705 if (!rel) continue;
706 if (rel.endsWith('/')) {
707 // git only emits a trailing-slash directory entry for an embedded repo.
708 // Guard with a .git check anyway, and skip anything else exactly as git
709 // itself skips it (we never descend into a non-repo opaque dir). Never
710 // descend into default-ignored locations — an embedded repo inside
711 // node_modules is an npm git-dependency, not project code.
712 const childDir = path.join(repoDir, rel);
713 // A git worktree surfaces here as an opaque untracked dir too — skip it,
714 // it's a duplicate working view of an already-indexed repo (#848).
715 if (classifyGitDir(childDir) === 'embedded' && !defaultsOnlyIgnore().ignores(rel)) {
716 embeddedRoots?.add(normalizePath(prefix + rel));
717 collectGitFiles(childDir, prefix + rel, files, embeddedRoots, includeIgnored);
718 }

Callers 1

getGitVisibleFilesFunction · 0.85

Calls 8

normalizePathFunction · 0.90
classifyGitDirFunction · 0.85
defaultsOnlyIgnoreFunction · 0.85
buildDefaultIgnoreFunction · 0.85
findIgnoredEmbeddedReposFunction · 0.85
joinMethod · 0.80
ignoresMethod · 0.80

Tested by

no test coverage detected