MCPcopy
hub / github.com/langroid/langroid / get_documents

Method get_documents

langroid/parsing/repo_loader.py:511–583  ·  view source on GitHub ↗

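Recursively get all files under a path as Document objects. Args: path (str|bytes): The path to the directory or file, or bytes content. The bytes option is meant to support the case where the content has already been read from a file in an upstream process (e.g. from an API or a database), and we want to avoid having to write it to a temporary file just to read it again.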

(
        path: str | bytes,
        parser: Parser = Parser(ParsingConfig()),
        file_types: Optional[List[str]] = None,
        exclude_dirs: Optional[List[str]] = None,
        depth: int = -1,
        lines: Optional[int] = None,
        doc_type: str | DocumentType | None = None,
    )

Source from the content-addressed store, hash-verified

509
510 @staticmethod
511 def get_documents(
512 path: str | bytes,
513 parser: Parser = Parser(ParsingConfig()),
514 file_types: Optional[List[str]] = None,
515 exclude_dirs: Optional[List[str]] = None,
516 depth: int = -1,
517 lines: Optional[int] = None,
518 doc_type: str | DocumentType | None = None,
519 ) -> List[Document]:
520 """
521 Recursively get all files under a path as Document objects.
522
523 Args:
524 path (str|bytes): The path to the directory or file, or bytes content.
525 The bytes option is meant to support the case where the content
526 has already been read from a file in an upstream process
527 (e.g. from an API or a database), and we want to avoid having to
528 write it to a temporary file just to read it again.
529 (which can be very slow for large files,
530 especially in a docker container)
531 parser (Parser): Parser to use to parse files.
532 file_types (List[str], optional): List of file extensions OR
533 filenames OR file_path_names to include.
534 Defaults to None, which includes all files.
535 exclude_dirs (List[str], optional): List of directories to exclude.
536 Defaults to None, which includes all directories.
537 depth (int, optional): Max depth of recursion. Defaults to -1,
538 which includes all depths.
539 lines (int, optional): Number of lines to read from each file.
540 Defaults to None, which reads all lines.
541 doc_type (str|DocumentType | None, optional): The type of document to parse.
542 Returns:
543 List[Document]: List of Document objects representing files.
544
545 """
546 docs = []
547 file_paths = []
548 if isinstance(path, bytes):
549 file_paths.append(path)
550 else:
551 path_obj = Path(path).resolve()
552
553 if path_obj.is_file():
554 file_paths.append(str(path_obj))
555 else:
556 path_depth = len(path_obj.parts)
557 for root, dirs, files in os.walk(path):
558 # Exclude directories if needed
559 if exclude_dirs:
560 dirs[:] = [d for d in dirs if d not in exclude_dirs]
561
562 current_depth = len(Path(root).resolve().parts) - path_depth
563 if depth == -1 or current_depth <= depth:
564 for file in files:
565 file_path = str(Path(root) / file)
566 if (
567 file_types is None
568 or RepoLoader._file_type(file_path) in file_types

Callers 4

_async_get_documentsMethod · 0.80
ingest_doc_pathsMethod · 0.80
test_repo_loaderFunction · 0.80

Calls 4

ParserClass · 0.90
ParsingConfigClass · 0.90
_file_typeMethod · 0.80

Tested by 1

test_repo_loaderFunction · 0.64