(path, start_page=1, end_page=None)
| 28 | |
| 29 | |
def pdf_to_text(path, start_page=1, end_page=None):
    """Extract the text of a page range from a PDF, one entry per page.

    Args:
        path: Location of the PDF, passed straight to ``fitz.open``.
        start_page: 1-based number of the first page to read.
        end_page: 1-based number of the last page to read (inclusive).
            ``None`` means "through the last page of the document".

    Returns:
        A list holding, for each page in the range and in page order, the
        result of ``preprocess`` applied to that page's extracted text.
    """
    doc = fitz.open(path)
    try:
        if end_page is None:
            end_page = doc.page_count

        # Page numbers are 1-based for callers but 0-based for load_page.
        return [
            preprocess(doc.load_page(i).get_text("text"))
            for i in range(start_page - 1, end_page)
        ]
    finally:
        # Release the document even when extraction or preprocessing raises;
        # previously close() was skipped on any error, leaking the handle.
        doc.close()
| 46 | |
| 47 | |
| 48 | def text_to_chunks(texts, word_length=150, start_page=1): |
no test coverage detected