MCPcopy Index your code
hub / github.com/Doorman11991/smallcode / scrape_repo

Function scrape_repo

scripts/rag_scraper.py:175–204  ·  view source on GitHub ↗
(spec: RepoSpec, repo_root: Path, options: argparse.Namespace)

Source from the content-addressed store, hash-verified

173
174
def scrape_repo(spec: RepoSpec, repo_root: Path, options: argparse.Namespace) -> Iterator[Dict[str, object]]:
    """Yield one snippet record for each code chunk found under *repo_root*.

    Files are taken from ``iter_code_files`` and split with ``chunk_file``.
    Scanning stops as soon as either the file budget or the snippet budget
    is exhausted.

    Args:
        spec: Repository descriptor. ``url``, ``name``, ``tags``,
            ``max_files`` and ``max_snippets`` are read from it; a falsy
            per-repo limit falls back to the matching value in *options*.
        repo_root: Directory that is walked for code files; the ``path``
            field of every record is relative to it.
        options: Parsed command-line options. Reads ``max_files_per_repo``,
            ``max_snippets_per_repo``, ``languages`` (comma-separated),
            ``max_file_bytes``, ``chunk_lines``, ``overlap`` and
            ``min_chars``.

    Yields:
        A dict with the keys ``id``, ``repo``, ``repoName``, ``tags``,
        ``path`` and ``lang``, merged with every key of the chunk returned
        by ``chunk_file`` (which supplies at least ``code`` and
        ``startLine``). Chunk keys win on a name collision because they are
        merged last.
    """
    max_files = spec.max_files or options.max_files_per_repo
    max_snippets = spec.max_snippets or options.max_snippets_per_repo

    languages = None
    if options.languages:
        # Accept "python, go" and stray commas: strip each name and drop
        # empty tokens so that a padded or blank entry can never silently
        # filter out every file. Nothing left means "no language filter".
        languages = {
            name.strip()
            for name in options.languages.split(",")
            if name.strip()
        } or None

    file_count = 0
    snippet_count = 0
    for file_path, lang in iter_code_files(repo_root, options.max_file_bytes, languages):
        if file_count >= max_files or snippet_count >= max_snippets:
            break
        # Counted before the read, so an unreadable file still uses up one
        # slot of the file budget.
        file_count += 1
        try:
            text = file_path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue

        # Always store forward-slash paths, whatever the host separator is.
        rel = file_path.relative_to(repo_root).as_posix()
        for chunk in chunk_file(text, lang, options.chunk_lines, options.overlap, options.min_chars):
            if snippet_count >= max_snippets:
                break
            code = str(chunk["code"])
            record = {
                "id": snippet_id(spec.url, rel, int(chunk["startLine"]), code),
                "repo": spec.url,
                "repoName": spec.name or safe_repo_name(spec.url),
                "tags": spec.tags,
                "path": rel,
                "lang": lang,
                **chunk,
            }
            snippet_count += 1
            yield record
205
206
207def load_curated(preset: str) -> List[RepoSpec]:

Callers 1

mainFunction · 0.85

Calls 4

iter_code_filesFunction · 0.85
chunk_fileFunction · 0.85
snippet_idFunction · 0.85
safe_repo_nameFunction · 0.85

Tested by

no test coverage detected