MCPcopy
hub / github.com/datalab-to/marker / assign_ocr_lines

Method assign_ocr_lines

marker/processors/table.py:671–694  ·  view source on GitHub ↗
(self, tables: List[TableResult], table_blocks: list)

Source from the content-addressed store, hash-verified

669 return ocr_results
670
def assign_ocr_lines(self, tables: List[TableResult], table_blocks: list):
    """Fill in missing cell text for tables by running OCR on them.

    ``self.needs_ocr`` selects the tables that require OCR together with
    the polygons to recognise and the indices of the selected entries in
    ``table_blocks``. The matching table images are passed to
    ``self.get_ocr_results`` and the recognised text is written onto every
    cell whose ``text_lines`` is ``None``. Cells are modified in place and
    nothing is returned.

    Args:
        tables: Table results whose ``cells`` may lack ``text_lines``.
        table_blocks: One entry per table, indexable by the
            ``"table_image"`` key; entries selected by ``needs_ocr`` have
            that image sent to OCR.

    Raises:
        AssertionError: If the number of selected images differs from the
            number of polygon groups, or if OCR returns a different number
            of text lines than there are cells awaiting text.
    """
    ocr_tables, ocr_polys, ocr_idxs = self.needs_ocr(tables, table_blocks)

    # Build the index lookup once: membership is then O(1) per block
    # instead of a scan of ``ocr_idxs`` for every entry in ``table_blocks``.
    # Iterating ``table_blocks`` keeps the images in block order.
    ocr_idx_lookup = set(ocr_idxs)
    det_images = [
        block["table_image"]
        for idx, block in enumerate(table_blocks)
        if idx in ocr_idx_lookup
    ]
    assert len(det_images) == len(ocr_polys), (
        f"Number of detection images and OCR polygons must match: {len(det_images)} != {len(ocr_polys)}"
    )

    # Propagate this processor's progress-bar setting to the recognizer.
    self.recognition_model.disable_tqdm = self.disable_tqdm
    ocr_results = self.get_ocr_results(table_images=det_images, ocr_polys=ocr_polys)

    for result, ocr_res in zip(ocr_tables, ocr_results):
        table_cells: List[SuryaTableCell] = result.cells
        cells_need_text = [tc for tc in table_cells if tc.text_lines is None]

        # The pairing below is positional, so a length mismatch would
        # attach text to the wrong cells; fail loudly instead.
        assert len(cells_need_text) == len(ocr_res.text_lines), (
            "Number of cells needing text and OCR results must match"
        )

        for cell_text, cell_needs_text in zip(ocr_res.text_lines, cells_need_text):
            # Don't need to correct back to image size
            # Table rec boxes are relative to the table
            # "<br>" separates the lines recognised inside a single cell.
            cell_needs_text.text_lines = [
                {"text": line} for line in cell_text.text.split("<br>")
            ]
695
696 def get_table_rec_batch_size(self):
697 if self.table_rec_batch_size is not None:

Callers 1

__call__Method · 0.95

Calls 2

needs_ocrMethod · 0.95
get_ocr_resultsMethod · 0.95

Tested by

no test coverage detected