(source, pdflib: str)
| 33 | ], |
| 34 | ) |
def test_get_pdf_doc_url(source, pdflib: str):
    """Parse the sample PDF and validate the full document and its chunks.

    Args:
        source: When equal to ``"bytes"``, the parser is rebuilt from the raw
            bytes of the PDF; for any other value the path-based parser is
            used as-is.
        pdflib: Library name forwarded to ``PdfParsingConfig(library=...)``.
    """
    url = "tests/main/data/openr-1-3.pdf"
    parsing_config = ParsingConfig(
        n_neighbor_ids=2,
        pdf=PdfParsingConfig(library=pdflib),
    )
    pdf_parser = DocumentParser.create(url, parsing_config)

    if source == "bytes":
        # Re-create the parser from in-memory content, reusing the config of
        # the path-based parser; getvalue() turns the BytesIO into bytes.
        raw_pdf = pdf_parser._load_doc_as_bytesio().getvalue()
        pdf_parser = DocumentParser.create(raw_pdf, pdf_parser.config)

    doc = pdf_parser.get_doc()

    # Whole-document checks.
    assert isinstance(doc.content, str)
    assert len(doc.content) > 0  # assuming the PDF is not empty
    expected_source = "bytes" if source == "bytes" else url
    assert doc.metadata.source == expected_source

    # Chunk-level checks.
    chunks = pdf_parser.get_doc_chunks()
    assert len(chunks) > 0
    assert all(chunk.metadata.is_chunk for chunk in chunks)

    n_chunks = len(chunks)
    k = pdf_parser.config.n_neighbor_ids
    window_size = 2 * k + 1
    if n_chunks > window_size:
        # With enough chunks, the middle chunk should have k neighbors on
        # each side plus itself.
        middle_chunk = chunks[n_chunks // 2]
        assert len(middle_chunk.metadata.window_ids) == window_size
| 70 | |
| 71 | |
| 72 | @pytest.mark.xfail( |
nothing calls this directly
no test coverage detected
searching dependent graphs…