Dedicated functional test for the default `pypdfium2` PDF parser. `pypdfium2` is installed by default (core dependency), so this test does NOT require any optional `extras` to be installed -- it exercises the out-of-the-box PDF-parsing path that a bare `pip install langroid` gets.
(source: str)
| 119 | |
@pytest.mark.parametrize("source", ["path", "bytes"])
def test_pypdfium2_parser(source: str):
    """
    Functional check of the default `pypdfium2` PDF parser.

    `pypdfium2` is a core dependency (installed by default), so no optional
    `extras` are needed here: this covers the PDF-parsing path available
    after a bare `pip install langroid`.

    The sample PDF is parsed either straight from its file path or from its
    raw bytes, depending on `source`; in both cases the extracted text, the
    document's source metadata and the chunk metadata are verified.
    """
    from langroid.parsing.document_parser import PyPDFium2Parser

    pdf_path = "tests/main/data/dummy.pdf"

    # Always start from a path-based parser: it validates the factory's
    # dispatch and supplies the config reused by the bytes variant below.
    parser = DocumentParser.create(
        pdf_path,
        ParsingConfig(pdf=PdfParsingConfig(library="pypdfium2")),
    )
    assert isinstance(parser, PyPDFium2Parser)

    if source == "bytes":
        with open(pdf_path, "rb") as pdf_file:
            raw_pdf = pdf_file.read()
        parser = DocumentParser.create(raw_pdf, parser.config)
        assert isinstance(parser, PyPDFium2Parser)

    # Source label the test expects in the metadata: the file path when
    # parsing from a path, otherwise the literal "bytes".
    expected_source = "bytes" if source != "path" else pdf_path

    document = parser.get_doc()
    text = document.content
    assert isinstance(text, str)
    # content correctness: known text from the sample PDF
    for known_snippet in ("Design and Evaluation", "arXiv:2004.07606v1"):
        assert known_snippet in text
    assert document.metadata.source == expected_source

    chunks = parser.get_doc_chunks()
    assert len(chunks) > 0
    for chunk in chunks:
        assert chunk.metadata.is_chunk
    for chunk in chunks:
        assert expected_source in chunk.metadata.source
| 156 | |
| 157 | |
| 158 | # @pytest.mark.skipif( |
nothing calls this directly
no test coverage detected
searching dependent graphs…