Get document chunks from a PDF source, with page references in the document metadata. Returns: List[Document]: a list of `Document` objects, each containing a chunk of text.
(self)
| 379 | return Document(content=text, metadata=DocMetaData(source=self.source)) |
| 380 | |
| 381 | def get_doc_chunks(self) -> List[Document]: |
| 382 | """ |
| 383 | Get document chunks from a pdf source, |
| 384 | with page references in the document metadata. |
| 385 | |
| 386 | Returns: |
| 387 | List[Document]: a list of `Document` objects, |
| 388 | each containing a chunk of text |
| 389 | """ |
| 390 | |
| 391 | split = [] # tokens in curr split |
| 392 | pages: List[str] = [] |
| 393 | docs: List[Document] = [] |
| 394 | # metadata.id to be shared by ALL chunks of this document |
| 395 | common_id = ObjectRegistry.new_id() |
| 396 | n_chunks = 0 # how many chunks so far |
| 397 | for i, page in self.iterate_pages(): |
| 398 | # not used but could be useful, esp to blend the |
| 399 | # metadata from the pages into the chunks |
| 400 | page_doc = self.get_document_from_page(page) |
| 401 | page_text = page_doc.content |
| 402 | split += self.tokenizer.encode(page_text) |
| 403 | pages.append(str(i + 1)) |
| 404 | # split could be so long it needs to be split |
| 405 | # into multiple chunks. Or it could be so short |
| 406 | # that it needs to be combined with the next chunk. |
| 407 | while len(split) > self.config.chunk_size: |
| 408 | # pretty formatting of pages (e.g. 1-3, 4, 5-7) |
| 409 | p_0 = int(pages[0]) - self.config.page_number_offset |
| 410 | p_n = int(pages[-1]) - self.config.page_number_offset |
| 411 | page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}" |
| 412 | text = self.tokenizer.decode(split[: self.config.chunk_size]) |
| 413 | docs.append( |
| 414 | Document( |
| 415 | content=text, |
| 416 | metadata=DocMetaData( |
| 417 | source=f"{self.source} {page_str}", |
| 418 | is_chunk=True, |
| 419 | id=common_id, |
| 420 | ), |
| 421 | ) |
| 422 | ) |
| 423 | n_chunks += 1 |
| 424 | split = split[self.config.chunk_size - self.config.overlap :] |
| 425 | pages = [str(i + 1)] |
| 426 | # there may be a last split remaining: |
| 427 | # if it's shorter than the overlap, we shouldn't make a chunk for it |
| 428 | # since it's already included in the prior chunk; |
| 429 | # the only exception is if there have been no chunks so far. |
| 430 | if len(split) > self.config.overlap or n_chunks == 0: |
| 431 | p_0 = int(pages[0]) - self.config.page_number_offset |
| 432 | p_n = int(pages[-1]) - self.config.page_number_offset |
| 433 | page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}" |
| 434 | text = self.tokenizer.decode(split[: self.config.chunk_size]) |
| 435 | docs.append( |
| 436 | Document( |
| 437 | content=text, |
| 438 | metadata=DocMetaData( |