Method assign_ocr_lines

marker/processors/table.py:671–694 · view source on GitHub ↗

(self, tables: List[TableResult], table_blocks: list)

Source from the content-addressed store, hash-verified

669	return ocr_results
670
def assign_ocr_lines(self, tables: List[TableResult], table_blocks: list):
    """Run OCR for the tables that need it and store the text on their cells.

    ``self.needs_ocr`` selects the tables to process and returns, in
    parallel, the selected table results, their OCR polygons and their
    indices into ``table_blocks``. The ``"table_image"`` of each selected
    block is passed to ``self.get_ocr_results`` together with the polygons.
    For every selected table, each cell whose ``text_lines`` is ``None``
    receives a list of ``{"text": ...}`` dicts, one per ``"<br>"``-separated
    segment of the corresponding OCR line.

    Cells are updated in place; nothing is returned.

    Args:
        tables: Table results whose ``cells`` may be missing ``text_lines``.
        table_blocks: Per-table dicts; only ``"table_image"`` is read here.

    Raises:
        AssertionError: If the number of selected images differs from the
            number of OCR polygons, or if a table's OCR output does not
            contain exactly one line per cell that still needs text.
    """
    ocr_tables, ocr_polys, ocr_idxs = self.needs_ocr(tables, table_blocks)

    # Hoist the indices into a set so the filter below does one O(1) lookup
    # per block instead of rescanning the index sequence each time.
    ocr_idx_set = set(ocr_idxs)
    det_images = [
        t["table_image"] for i, t in enumerate(table_blocks) if i in ocr_idx_set
    ]

    # Raised explicitly (rather than via ``assert``) so the check survives
    # ``python -O``; otherwise the zips below would silently truncate.
    if len(det_images) != len(ocr_polys):
        raise AssertionError(
            f"Number of detection images and OCR polygons must match: {len(det_images)} != {len(ocr_polys)}"
        )

    self.recognition_model.disable_tqdm = self.disable_tqdm
    ocr_results = self.get_ocr_results(table_images=det_images, ocr_polys=ocr_polys)

    for result, ocr_res in zip(ocr_tables, ocr_results):
        table_cells: List[SuryaTableCell] = result.cells
        cells_need_text = [tc for tc in table_cells if tc.text_lines is None]

        # Same reasoning as above: a mismatch here would pair OCR lines
        # with the wrong cells, so it must never be skipped.
        if len(cells_need_text) != len(ocr_res.text_lines):
            raise AssertionError(
                "Number of cells needing text and OCR results must match"
            )

        for cell_text, cell_needs_text in zip(ocr_res.text_lines, cells_need_text):
            # Don't need to correct back to image size
            # Table rec boxes are relative to the table
            cell_text_lines = [{"text": t} for t in cell_text.text.split("<br>")]
            cell_needs_text.text_lines = cell_text_lines
695
696	def get_table_rec_batch_size(self):
697	if self.table_rec_batch_size is not None:

__call__Method · 0.95

needs_ocrMethod · 0.95

get_ocr_resultsMethod · 0.95

no test coverage detected