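Recursively get all files under a path as Document objects. Args: path (str|bytes): The path to the directory or file, or bytes content. The bytes option is meant to support the case where the content has already been read from a file in an upstream process (e.g. from an API or a database), and we want to avoid having to write it to a temporary file just to read it again.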
(
path: str | bytes,
parser: Parser = Parser(ParsingConfig()),
file_types: Optional[List[str]] = None,
exclude_dirs: Optional[List[str]] = None,
depth: int = -1,
lines: Optional[int] = None,
doc_type: str | DocumentType | None = None,
)
| 509 | |
| 510 | @staticmethod |
| 511 | def get_documents( |
| 512 | path: str | bytes, |
| 513 | parser: Parser = Parser(ParsingConfig()), |
| 514 | file_types: Optional[List[str]] = None, |
| 515 | exclude_dirs: Optional[List[str]] = None, |
| 516 | depth: int = -1, |
| 517 | lines: Optional[int] = None, |
| 518 | doc_type: str | DocumentType | None = None, |
| 519 | ) -> List[Document]: |
| 520 | """ |
| 521 | Recursively get all files under a path as Document objects. |
| 522 | |
| 523 | Args: |
| 524 | path (str|bytes): The path to the directory or file, or bytes content. |
| 525 | The bytes option is meant to support the case where the content |
| 526 | has already been read from a file in an upstream process |
| 527 | (e.g. from an API or a database), and we want to avoid having to |
| 528 | write it to a temporary file just to read it again. |
| 529 | (which can be very slow for large files, |
| 530 | especially in a docker container) |
| 531 | parser (Parser): Parser to use to parse files. |
| 532 | file_types (List[str], optional): List of file extensions OR |
| 533 | filenames OR file_path_names to include. |
| 534 | Defaults to None, which includes all files. |
| 535 | exclude_dirs (List[str], optional): List of directories to exclude. |
| 536 | Defaults to None, which includes all directories. |
| 537 | depth (int, optional): Max depth of recursion. Defaults to -1, |
| 538 | which includes all depths. |
| 539 | lines (int, optional): Number of lines to read from each file. |
| 540 | Defaults to None, which reads all lines. |
| 541 | doc_type (str|DocumentType | None, optional): The type of document to parse. |
| 542 | Returns: |
| 543 | List[Document]: List of Document objects representing files. |
| 544 | |
| 545 | """ |
| 546 | docs = [] |
| 547 | file_paths = [] |
| 548 | if isinstance(path, bytes): |
| 549 | file_paths.append(path) |
| 550 | else: |
| 551 | path_obj = Path(path).resolve() |
| 552 | |
| 553 | if path_obj.is_file(): |
| 554 | file_paths.append(str(path_obj)) |
| 555 | else: |
| 556 | path_depth = len(path_obj.parts) |
| 557 | for root, dirs, files in os.walk(path): |
| 558 | # Exclude directories if needed |
| 559 | if exclude_dirs: |
| 560 | dirs[:] = [d for d in dirs if d not in exclude_dirs] |
| 561 | |
| 562 | current_depth = len(Path(root).resolve().parts) - path_depth |
| 563 | if depth == -1 or current_depth <= depth: |
| 564 | for file in files: |
| 565 | file_path = str(Path(root) / file) |
| 566 | if ( |
| 567 | file_types is None |
| 568 | or RepoLoader._file_type(file_path) in file_types |