(pdflib: str)
| 8 | |
@pytest.mark.parametrize("pdflib", ["unstructured"])
def test_get_pdf_doc_url(pdflib: str):
    """Check whole-document and chunked parsing of a PDF given by URL.

    A parser is created via ``DocumentParser.create`` for an arXiv PDF URL,
    configured with ``n_neighbor_ids=2`` and the parametrized PDF library.
    The test then verifies that:

    * ``get_doc()`` returns a document with non-empty string content whose
      ``metadata.source`` equals the URL;
    * ``get_doc_chunks()`` returns at least one chunk, and every chunk has
      ``metadata.is_chunk`` set;
    * when there are more than ``2 * k + 1`` chunks (``k`` being the
      configured ``n_neighbor_ids``), the middle chunk's
      ``metadata.window_ids`` holds exactly ``2 * k + 1`` entries.

    NOTE(review): presumably this fetches the PDF over the network, so the
    test needs connectivity -- confirm against the parser implementation.
    """
    url = "https://arxiv.org/pdf/2104.05490.pdf"
    parsing_config = ParsingConfig(
        n_neighbor_ids=2,
        pdf=PdfParsingConfig(library=pdflib),
    )
    parser = DocumentParser.create(url, parsing_config)

    # Whole-document extraction.
    document = parser.get_doc()
    assert isinstance(document.content, str)
    assert len(document.content) > 0  # assuming the PDF is not empty
    assert document.metadata.source == url

    # Chunked extraction.
    chunks = parser.get_doc_chunks()
    assert len(chunks) > 0
    assert all(chunk.metadata.is_chunk for chunk in chunks)

    # The window size is only checked on the middle chunk, and only when the
    # document yields more chunks than one full window.
    num_chunks = len(chunks)
    neighbors = parser.config.n_neighbor_ids
    if num_chunks > 2 * neighbors + 1:
        middle_chunk = chunks[num_chunks // 2]
        assert len(middle_chunk.metadata.window_ids) == 2 * neighbors + 1
| 37 | |
| 38 | |
| 39 | @pytest.mark.parametrize("pdflib", ["unstructured"]) |
nothing calls this directly
no test coverage detected
searching dependent graphs…