MCPcopy
hub / github.com/langroid/langroid / test_get_pdf_doc_url

Function test_get_pdf_doc_url

tests/main/test_pdf_parser.py:35–69  ·  view source on GitHub ↗
(source, pdflib: str)

Source from the content-addressed store, hash-verified

33 ],
34)
def test_get_pdf_doc_url(source, pdflib: str) -> None:
    """Check that a sample PDF parses into a document and into chunks.

    Args:
        source: Selects how the PDF is handed to the parser. When it equals
            ``"bytes"`` the parser is rebuilt from the raw bytes of the file;
            for any other value the path-based parser is used as is.
        pdflib: Forwarded as ``library`` to ``PdfParsingConfig``.
    """
    url = "tests/main/data/openr-1-3.pdf"
    pdf_parser = DocumentParser.create(
        url,
        ParsingConfig(
            n_neighbor_ids=2,
            pdf=PdfParsingConfig(library=pdflib),
        ),
    )

    if source == "bytes":
        # Named `pdf_stream` (not `bytes`) so the builtin type is not shadowed.
        pdf_stream = pdf_parser._load_doc_as_bytesio()
        pdf_parser = DocumentParser.create(
            pdf_stream.getvalue(),  # convert BytesIO to bytes
            pdf_parser.config,
        )

    doc = pdf_parser.get_doc()

    # Check the whole-document result.
    assert isinstance(doc.content, str)
    assert len(doc.content) > 0  # assuming the PDF is not empty
    assert doc.metadata.source == ("bytes" if source == "bytes" else url)

    # Check the chunked result.
    docs = pdf_parser.get_doc_chunks()
    assert len(docs) > 0
    assert all(d.metadata.is_chunk for d in docs)

    n = len(docs)
    k = pdf_parser.config.n_neighbor_ids
    window_size = 2 * k + 1
    # Only inspect the middle chunk when there are more chunks than one full
    # window; presumably chunks near the ends have shorter windows -- confirm
    # against the chunking implementation.
    if n > window_size:
        assert len(docs[n // 2].metadata.window_ids) == window_size
70
71
72@pytest.mark.xfail(

Callers

nothing calls this directly

Calls 6

ParsingConfigClass · 0.90
PdfParsingConfigClass · 0.90
_load_doc_as_bytesioMethod · 0.80
get_docMethod · 0.80
get_doc_chunksMethod · 0.80
createMethod · 0.45

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…