MCPcopy
hub / github.com/langroid/langroid / get_doc_chunks

Method get_doc_chunks

langroid/parsing/document_parser.py:381–446  ·  view source on GitHub ↗

Get document chunks from a PDF source, with page references in the document metadata. Returns: List[Document]: a list of `Document` objects, each containing a chunk of text.

(self)

Source from the content-addressed store, hash-verified

379 return Document(content=text, metadata=DocMetaData(source=self.source))
380
381 def get_doc_chunks(self) -> List[Document]:
382 """
383 Get document chunks from a pdf source,
384 with page references in the document metadata.
385
386 Returns:
387 List[Document]: a list of `Document` objects,
388 each containing a chunk of text
389 """
390
391 split = [] # tokens in curr split
392 pages: List[str] = []
393 docs: List[Document] = []
394 # metadata.id to be shared by ALL chunks of this document
395 common_id = ObjectRegistry.new_id()
396 n_chunks = 0 # how many chunk so far
397 for i, page in self.iterate_pages():
398 # not used but could be useful, esp to blend the
399 # metadata from the pages into the chunks
400 page_doc = self.get_document_from_page(page)
401 page_text = page_doc.content
402 split += self.tokenizer.encode(page_text)
403 pages.append(str(i + 1))
404 # split could be so long it needs to be split
405 # into multiple chunks. Or it could be so short
406 # that it needs to be combined with the next chunk.
407 while len(split) > self.config.chunk_size:
408 # pretty formatting of pages (e.g. 1-3, 4, 5-7)
409 p_0 = int(pages[0]) - self.config.page_number_offset
410 p_n = int(pages[-1]) - self.config.page_number_offset
411 page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
412 text = self.tokenizer.decode(split[: self.config.chunk_size])
413 docs.append(
414 Document(
415 content=text,
416 metadata=DocMetaData(
417 source=f"{self.source} {page_str}",
418 is_chunk=True,
419 id=common_id,
420 ),
421 )
422 )
423 n_chunks += 1
424 split = split[self.config.chunk_size - self.config.overlap :]
425 pages = [str(i + 1)]
426 # there may be a last split remaining:
427 # if it's shorter than the overlap, we shouldn't make a chunk for it
428 # since it's already included in the prior chunk;
429 # the only exception is if there have been no chunks so far.
430 if len(split) > self.config.overlap or n_chunks == 0:
431 p_0 = int(pages[0]) - self.config.page_number_offset
432 p_n = int(pages[-1]) - self.config.page_number_offset
433 page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
434 text = self.tokenizer.decode(split[: self.config.chunk_size])
435 docs.append(
436 Document(
437 content=text,
438 metadata=DocMetaData(

Callers 15

_process_document · Method · 0.80
test_get_pdf_doc_url · Function · 0.80
test_get_pdf_doc_path · Function · 0.80
test_get_pdf_doc_url · Function · 0.80
test_get_pdf_doc_path · Function · 0.80
test_pypdfium2_parser · Function · 0.80
test_image_pdf · Function · 0.80
test_get_docx_file · Function · 0.80

Calls 6

iterate_pages · Method · 0.95
Document · Class · 0.90
DocMetaData · Class · 0.90
new_id · Method · 0.80
add_window_ids · Method · 0.80

Tested by 14

test_get_pdf_doc_url · Function · 0.64
test_get_pdf_doc_path · Function · 0.64
test_get_pdf_doc_url · Function · 0.64
test_get_pdf_doc_path · Function · 0.64
test_pypdfium2_parser · Function · 0.64
test_image_pdf · Function · 0.64
test_get_docx_file · Function · 0.64
test_llm_pdf_parser · Function · 0.64
test_marker_pdf_parser · Function · 0.64