Get the entire text from the source as a single document. Returns: a `Document` object containing the content of the PDF file, with metadata containing the source name (URL or path).
(self)
| 362 | return re.sub(r" +\-", "-", text) |
| 363 | |
def get_doc(self) -> Document:
    """
    Extract the full text of the source and wrap it in one document.

    Every page yielded by `self.iterate_pages()` is converted with
    `self.get_document_from_page()`, and the page contents are
    concatenated in iteration order with no separator between pages.

    Returns:
        a `Document` object whose content is the text of the whole
        pdf file, and whose metadata holds the source name
        (URL or path) taken from `self.source`
    """
    page_texts = []
    # iterate_pages() yields pairs; only the second item (the page) is used.
    for _, pg in self.iterate_pages():
        page_doc = self.get_document_from_page(pg)
        page_texts.append(page_doc.content)
    full_text = "".join(page_texts)

    meta = DocMetaData(source=self.source)
    return Document(content=full_text, metadata=meta)
| 380 | |
| 381 | def get_doc_chunks(self) -> List[Document]: |
| 382 | """ |