MCPcopy
hub / github.com/langroid/langroid / get_doc

Method get_doc

langroid/parsing/document_parser.py:364–379  ·  view source on GitHub ↗

Get entire text from source as a single document. Returns: a `Document` object containing the content of the pdf file, and metadata containing source name (URL or path)

(self)

Source from the content-addressed store, hash-verified

362 return re.sub(r" +\-", "-", text)
363
def get_doc(self) -> Document:
    """
    Read every page of the source and merge the text into one document.

    Returns:
        a `Document` whose content is the text of all pages joined
        back-to-back (no separator), and whose metadata records the
        source name (URL or path) taken from `self.source`
    """
    page_texts = []
    # iterate_pages() yields pairs; only the page object (second item) is used.
    for _, page in self.iterate_pages():
        page_doc = self.get_document_from_page(page)
        page_texts.append(page_doc.content)

    full_text = "".join(page_texts)
    meta = DocMetaData(source=self.source)
    return Document(content=full_text, metadata=meta)
380
381 def get_doc_chunks(self) -> List[Document]:
382 """

Callers 15

test_get_pdf_doc_urlFunction · 0.80
test_get_pdf_doc_pathFunction · 0.80
test_get_pdf_doc_urlFunction · 0.80
test_get_pdf_doc_pathFunction · 0.80
test_pypdfium2_parserFunction · 0.80
test_image_pdfFunction · 0.80
test_get_docx_fileFunction · 0.80
test_llm_pdf_parserFunction · 0.80
test_marker_pdf_parserFunction · 0.80

Calls 4

iterate_pagesMethod · 0.95
DocumentClass · 0.90
DocMetaDataClass · 0.90

Tested by 14

test_get_pdf_doc_urlFunction · 0.64
test_get_pdf_doc_pathFunction · 0.64
test_get_pdf_doc_urlFunction · 0.64
test_get_pdf_doc_pathFunction · 0.64
test_pypdfium2_parserFunction · 0.64
test_image_pdfFunction · 0.64
test_get_docx_fileFunction · 0.64
test_llm_pdf_parserFunction · 0.64
test_marker_pdf_parserFunction · 0.64