| 455 | table_cells[k].text_lines = text |
| 456 | |
def assign_pdftext_lines(self, extract_blocks: list, filepath: str) -> None:
    """Attach the text that ``table_output`` extracts to each table block.

    Blocks are grouped by their ``"page_id"``; for every page the table
    bounding boxes (``"table_bbox"``) and an image size (``"img_size"``) are
    handed to ``table_output`` together with ``filepath``. Each block is then
    updated in place:

    * ``block["table_text_lines"]`` is set to the extracted entry when it is
      non-empty;
    * ``block["ocr_block"]`` is set to ``True`` when the entry is empty, so
      the block gets re-OCRed.

    Args:
        extract_blocks: Dicts that each provide ``"page_id"``,
            ``"table_bbox"`` and ``"img_size"``. Mutated in place.
        filepath: Path forwarded unchanged to ``table_output``.

    Raises:
        AssertionError: If ``table_output`` does not return one entry per
            requested page, or one entry per table on a page.
    """
    # Single pass over the blocks instead of rescanning the whole list for
    # every page. Dict insertion order keeps pages in first-seen order and
    # blocks in their input order within a page.
    blocks_by_page = {}
    for block in extract_blocks:
        blocks_by_page.setdefault(block["page_id"], []).append(block)

    if not blocks_by_page:
        return

    unique_pages = list(blocks_by_page)
    table_inputs = [
        {
            "tables": [block["table_bbox"] for block in page_blocks],
            # The image size of the last block on the page is used.
            # NOTE(review): presumably all blocks of a page share one
            # img_size - confirm against the caller.
            "img_size": page_blocks[-1]["img_size"],
        }
        for page_blocks in blocks_by_page.values()
    ]

    cell_text = table_output(
        filepath,
        table_inputs,
        page_range=unique_pages,
        workers=self.pdftext_workers,
    )
    assert len(cell_text) == len(unique_pages), (
        "Number of pages and table inputs must match"
    )

    for page_tables, page_blocks in zip(cell_text, blocks_by_page.values()):
        # Check the count before touching any block: a short result would
        # otherwise surface as a bare IndexError, and a long one would only
        # be detected after the blocks had already been modified.
        assert len(page_tables) == len(page_blocks), (
            "Number of tables and table inputs must match"
        )
        for block, table_text in zip(page_blocks, page_tables):
            if len(table_text) == 0:
                # Re-OCR the block if pdftext didn't find any text
                block["ocr_block"] = True
            else:
                block["table_text_lines"] = table_text
| 497 | |
| 498 | def align_table_cells( |
| 499 | self, table: TableResult, table_detection_result: TextDetectionResult |