Function scrape_repo

scripts/rag_scraper.py:175–204 · view source on GitHub ↗

(spec: RepoSpec, repo_root: Path, options: argparse.Namespace)

Source from the content-addressed store, hash-verified

173
174
def scrape_repo(spec: RepoSpec, repo_root: Path, options: argparse.Namespace) -> Iterator[Dict[str, object]]:
    """Yield one snippet record per chunk of each code file under *repo_root*.

    Files are enumerated with ``iter_code_files`` and split with
    ``chunk_file``. Iteration stops as soon as the per-repo file limit or
    snippet limit is reached. Each limit is taken from *spec* when that
    value is truthy, otherwise from *options*.

    Args:
        spec: Repository description; ``url``, ``name``, ``tags``,
            ``max_files`` and ``max_snippets`` are read.
        repo_root: Directory passed to ``iter_code_files``; record paths
            are expressed relative to it.
        options: Parsed options; ``max_files_per_repo``,
            ``max_snippets_per_repo``, ``languages``, ``max_file_bytes``,
            ``chunk_lines``, ``overlap`` and ``min_chars`` are read.

    Yields:
        A dict holding ``id``, ``repo``, ``repoName``, ``tags``, ``path``
        and ``lang``, followed by every key of the chunk itself.
    """
    max_files = spec.max_files or options.max_files_per_repo
    max_snippets = spec.max_snippets or options.max_snippets_per_repo

    # Comma-separated language filter. Whitespace around each entry is
    # stripped so that "python, go" selects the same languages as "python,go".
    languages = (
        {name.strip() for name in options.languages.split(",")}
        if options.languages
        else None
    )

    file_count = 0
    snippet_count = 0
    for file_path, lang in iter_code_files(repo_root, options.max_file_bytes, languages):
        if file_count >= max_files or snippet_count >= max_snippets:
            break
        # Counted before the read: a file that cannot be read still uses up
        # one slot of the file budget.
        file_count += 1
        try:
            text = file_path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue

        # Forward-slash relative path, independent of the host OS separator.
        rel = file_path.relative_to(repo_root).as_posix()

        for chunk in chunk_file(text, lang, options.chunk_lines, options.overlap, options.min_chars):
            if snippet_count >= max_snippets:
                break
            code = str(chunk["code"])
            record = {
                "id": snippet_id(spec.url, rel, int(chunk["startLine"]), code),
                "repo": spec.url,
                "repoName": spec.name or safe_repo_name(spec.url),
                "tags": spec.tags,
                "path": rel,
                "lang": lang,
                # Unpacked last, so a chunk key with the same name as one of
                # the fixed keys above takes precedence.
                **chunk,
            }
            snippet_count += 1
            yield record
205
206
207	def load_curated(preset: str) -> List[RepoSpec]:

mainFunction · 0.85

iter_code_filesFunction · 0.85

chunk_fileFunction · 0.85

snippet_idFunction · 0.85

safe_repo_nameFunction · 0.85

no test coverage detected