Function test_get_pdf_doc_url

tests/main/test_pdf_parser.py:35–69 · view source on GitHub ↗

(source, pdflib: str)

Source from the content-addressed store, hash-verified

33	],
34	)
def test_get_pdf_doc_url(source, pdflib: str) -> None:
    """Parse the sample PDF and check both the full document and its chunks.

    A parser is created for the local sample PDF using the PDF library named
    by ``pdflib``. When ``source`` equals ``"bytes"``, the parser is re-created
    from the raw bytes of that same PDF, so the bytes-input path is exercised
    with the same configuration.

    Args:
        source: Input mode selector. The value ``"bytes"`` switches to the
            bytes-input path; any other value keeps the path-based parser.
            (Presumably supplied by the parametrize decorator above this
            function -- confirm against the decorator arguments.)
        pdflib: Name of the PDF parsing library passed to
            ``PdfParsingConfig(library=...)``.
    """
    url = "tests/main/data/openr-1-3.pdf"
    pdf_parser = DocumentParser.create(
        url,
        ParsingConfig(
            n_neighbor_ids=2,
            pdf=PdfParsingConfig(library=pdflib),
        ),
    )

    if source == "bytes":
        # Named `pdf_stream` rather than `bytes` so the builtin type is not
        # shadowed for the remainder of the test.
        pdf_stream = pdf_parser._load_doc_as_bytesio()
        pdf_parser = DocumentParser.create(
            pdf_stream.getvalue(),  # convert BytesIO to bytes
            pdf_parser.config,
        )

    doc = pdf_parser.get_doc()

    # Check the whole-document result.
    assert isinstance(doc.content, str)
    assert len(doc.content) > 0  # assuming the PDF is not empty
    assert doc.metadata.source == ("bytes" if source == "bytes" else url)

    # Check the chunked result.
    docs = pdf_parser.get_doc_chunks()
    assert len(docs) > 0
    assert all(d.metadata.is_chunk for d in docs)

    n = len(docs)
    k = pdf_parser.config.n_neighbor_ids
    # Only when there are more than 2k + 1 chunks is the middle chunk expected
    # to carry a full window of 2k + 1 ids.
    if n > 2 * k + 1:
        assert len(docs[n // 2].metadata.window_ids) == 2 * k + 1
70
71
72	@pytest.mark.xfail(

nothing calls this directly

ParsingConfigClass · 0.90

PdfParsingConfigClass · 0.90

_load_doc_as_bytesioMethod · 0.80

get_docMethod · 0.80

get_doc_chunksMethod · 0.80

createMethod · 0.45

no test coverage detected

searching dependent graphs…