MCPcopy
hub / github.com/datalab-to/marker / assign_pdftext_lines

Method assign_pdftext_lines

marker/processors/table.py:457–496  ·  view source on GitHub ↗
(self, extract_blocks: list, filepath: str)

Source from the content-addressed store, hash-verified

455 table_cells[k].text_lines = text
456
def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
    """Attach text lines returned by ``table_output`` to each table block.

    Blocks are grouped by ``page_id`` and submitted to ``table_output`` in a
    single call, with one ``{"tables": [...], "img_size": ...}`` entry per
    page. The result is expected to contain one entry per requested page
    and, within a page, one entry per table in the order the blocks appear
    in ``extract_blocks``.

    Each block is then updated in place: ``block["table_text_lines"]`` is
    set to the returned entry, or, when that entry is empty,
    ``block["ocr_block"]`` is set to ``True`` so the block is re-OCR'd.

    Args:
        extract_blocks: Dicts that each provide ``page_id``, ``table_bbox``
            and ``img_size``. Mutated in place. Nothing happens if empty.
        filepath: File path passed through to ``table_output``.

    Raises:
        AssertionError: If the number of returned pages, or the number of
            tables returned for a page, differs from what was requested.
    """
    # Group the blocks by page in one pass instead of rescanning the whole
    # list for every page. A dict keeps first-appearance order, so the page
    # order is deterministic and the block order within a page matches
    # ``extract_blocks``.
    blocks_by_page = {}
    for block in extract_blocks:
        blocks_by_page.setdefault(block["page_id"], []).append(block)
    if not blocks_by_page:
        return

    unique_pages = list(blocks_by_page)
    table_inputs = [
        {
            "tables": [block["table_bbox"] for block in page_blocks],
            # As before, the last block seen on a page supplies its size.
            "img_size": page_blocks[-1]["img_size"],
        }
        for page_blocks in blocks_by_page.values()
    ]

    cell_text = table_output(
        filepath,
        table_inputs,
        page_range=unique_pages,
        workers=self.pdftext_workers,
    )
    assert len(cell_text) == len(unique_pages), (
        "Number of pages and table inputs must match"
    )

    for page_tables, page_blocks in zip(cell_text, blocks_by_page.values()):
        # Check the count before touching any block so a mismatch reports
        # this message instead of an IndexError after a partial update.
        assert len(page_tables) == len(page_blocks), (
            "Number of tables and table inputs must match"
        )
        for block, table_text in zip(page_blocks, page_tables):
            if len(table_text) == 0:
                # Re-OCR the block if pdftext didn't find any text
                block["ocr_block"] = True
            else:
                block["table_text_lines"] = table_text
497
498 def align_table_cells(
499 self, table: TableResult, table_detection_result: TextDetectionResult

Callers 1

__call__Method · 0.95

Calls

no outgoing calls

Tested by

no test coverage detected