(spec: RepoSpec, repo_root: Path, options: argparse.Namespace)
| 173 | |
| 174 | |
def scrape_repo(spec: RepoSpec, repo_root: Path, options: argparse.Namespace) -> Iterator[Dict[str, object]]:
    """Lazily yield one snippet record per chunk of each code file under *repo_root*.

    Per-repo limits come from *spec* when set to a truthy value, otherwise
    from the global *options*. Iteration stops as soon as either the file
    budget or the snippet budget is used up.

    Args:
        spec: Repository description; ``url``, ``name``, ``tags``,
            ``max_files`` and ``max_snippets`` are read.
        repo_root: Directory of the checked-out repository to walk.
        options: Parsed CLI options; ``max_files_per_repo``,
            ``max_snippets_per_repo``, ``languages``, ``max_file_bytes``,
            ``chunk_lines``, ``overlap`` and ``min_chars`` are read.

    Yields:
        A dict holding ``id``, ``repo``, ``repoName``, ``tags``, ``path`` and
        ``lang``, followed by every key of the chunk mapping returned by
        ``chunk_file`` (chunk keys win on a name clash).
    """
    file_budget = spec.max_files or options.max_files_per_repo
    snippet_budget = spec.max_snippets or options.max_snippets_per_repo

    # A comma-separated language list narrows the walk; an empty/missing
    # value is passed on as None.
    if options.languages:
        wanted_langs = set(options.languages.split(","))
    else:
        wanted_langs = None

    files_seen = 0
    emitted = 0
    for source_path, lang in iter_code_files(repo_root, options.max_file_bytes, wanted_langs):
        if files_seen >= file_budget:
            break
        if emitted >= snippet_budget:
            break

        # The file is counted before it is read, so an unreadable file
        # still uses up one slot of the file budget.
        files_seen += 1
        try:
            contents = source_path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue

        # Store repo-relative paths with forward slashes on every platform.
        native_rel = str(source_path.relative_to(repo_root))
        rel = native_rel.replace(os.sep, "/")

        chunks = chunk_file(contents, lang, options.chunk_lines, options.overlap, options.min_chars)
        for chunk in chunks:
            if emitted >= snippet_budget:
                break
            code = str(chunk["code"])
            record = dict(
                id=snippet_id(spec.url, rel, int(chunk["startLine"]), code),
                repo=spec.url,
                repoName=spec.name or safe_repo_name(spec.url),
                tags=spec.tags,
                path=rel,
                lang=lang,
            )
            # Chunk fields are merged last, after the fixed metadata keys.
            record.update(chunk)
            emitted += 1
            yield record
| 205 | |
| 206 | |
| 207 | def load_curated(preset: str) -> List[RepoSpec]: |
no test coverage detected