(source, pdflib: str)
| 80 | ["unstructured", "docling", "fitz", "pypdf", "pypdfium2", "pymupdf4llm", "marker"], |
| 81 | ) |
def test_get_pdf_doc_path(source, pdflib: str):
    """Parse the dummy PDF with ``pdflib`` and validate the doc and its chunks.

    The PDF is loaded either from its file path or from its raw bytes,
    depending on ``source``.

    Args:
        source: ``"bytes"`` to parse the in-memory file contents; any other
            value parses from the file path. The expected citation is the
            file path when ``source == "path"`` and the literal ``"bytes"``
            otherwise. (Presumably supplied by a parametrize decorator
            above this function -- confirm against the full file.)
        pdflib: Name of the PDF parsing library passed to
            ``PdfParsingConfig(library=...)``.
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    tests_root = os.path.abspath(os.path.join(current_dir, ".."))
    path = os.path.join(tests_root, "main", "data", "dummy.pdf")

    pdf_parser = DocumentParser.create(
        path, ParsingConfig(pdf=PdfParsingConfig(library=pdflib))
    )

    if source == "bytes":
        # Re-create the parser from the raw file contents, reusing the config
        # of the path-based parser built above.
        with open(path, "rb") as f:
            pdf_bytes = f.read()  # named to avoid shadowing the builtin `bytes`
        pdf_parser = DocumentParser.create(pdf_bytes, pdf_parser.config)

    doc = pdf_parser.get_doc()

    # Check the results
    assert isinstance(doc.content, str)
    assert len(doc.content) > 0  # assuming the PDF is not empty
    citation = path if source == "path" else "bytes"
    assert doc.metadata.source == citation

    # Every chunk must be flagged as a chunk and carry the citation in its
    # source (substring match, not equality).
    docs = pdf_parser.get_doc_chunks()
    assert len(docs) > 0
    assert all(d.metadata.is_chunk for d in docs)
    assert all(citation in d.metadata.source for d in docs)
| 108 | |
| 109 | |
| 110 | def test_default_pdf_library_is_permissive(): |
nothing calls this directly
no test coverage detected
searching dependent graphs…