MCPcopy
hub / github.com/langroid/langroid / test_get_pdf_doc_path

Function test_get_pdf_doc_path

tests/main/test_pdf_parser.py:82–107  ·  view source on GitHub ↗
(source, pdflib: str)

Source from the content-addressed store, hash-verified

80 ["unstructured", "docling", "fitz", "pypdf", "pypdfium2", "pymupdf4llm", "marker"],
81)
def test_get_pdf_doc_path(source: str, pdflib: str) -> None:
    """Parse the dummy PDF and check the resulting document and its chunks.

    Args:
        source: ``"bytes"`` feeds the parser the raw file contents; any other
            value keeps the path-based parser. The expected metadata source
            is the file path when this is ``"path"`` and the literal
            ``"bytes"`` otherwise.
        pdflib: Value passed as ``library`` to ``PdfParsingConfig``.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    tests_root = os.path.abspath(os.path.join(current_dir, ".."))
    path = os.path.join(tests_root, "main", "data", "dummy.pdf")

    pdf_parser = DocumentParser.create(
        path, ParsingConfig(pdf=PdfParsingConfig(library=pdflib))
    )

    if source == "bytes":
        # Named `pdf_bytes` so the builtin `bytes` type is not shadowed.
        with open(path, "rb") as f:
            pdf_bytes = f.read()
        # Re-create the parser from raw bytes, reusing the config carried by
        # the path-based parser built above.
        pdf_parser = DocumentParser.create(pdf_bytes, pdf_parser.config)

    doc = pdf_parser.get_doc()

    # Check the results
    assert isinstance(doc.content, str)
    assert len(doc.content) > 0  # assuming the PDF is not empty
    citation = path if source == "path" else "bytes"
    assert doc.metadata.source == citation

    docs = pdf_parser.get_doc_chunks()
    assert len(docs) > 0
    assert all(d.metadata.is_chunk for d in docs)
    # Substring check: each chunk's source only has to contain the citation.
    assert all(citation in d.metadata.source for d in docs)
108
109
110def test_default_pdf_library_is_permissive():

Callers

nothing calls this directly

Calls 5

ParsingConfigClass · 0.90
PdfParsingConfigClass · 0.90
get_docMethod · 0.80
get_doc_chunksMethod · 0.80
createMethod · 0.45

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…