Method assign_pdftext_lines

marker/processors/table.py:457–496 · view source on GitHub ↗

(self, extract_blocks: list, filepath: str)

Source from the content-addressed store, hash-verified

455	table_cells[k].text_lines = text
456
def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
    """
    Attach the text lines returned by ``table_output`` to each table block.

    Blocks are grouped by ``page_id`` and a single ``table_output`` call is
    made covering every page that has at least one block. Each block is then
    updated in place: it receives ``table_text_lines`` when text was returned
    for its bbox, or ``ocr_block = True`` when the returned entry is empty.

    Args:
        extract_blocks: Dicts that provide ``page_id``, ``table_bbox`` and
            ``img_size`` keys. Mutated in place. Page ids are assumed to be
            mutually orderable (e.g. ints) -- TODO confirm against callers.
        filepath: Forwarded unchanged as the first argument of
            ``table_output`` (presumably the PDF path; verify against that
            helper).

    Returns:
        None. Returns early without calling ``table_output`` when
        ``extract_blocks`` is empty.

    Raises:
        AssertionError: If ``table_output`` returns a different number of
            pages than requested, or a different number of tables than there
            are blocks on a page.
    """
    # Group blocks by page in a single pass instead of rescanning the whole
    # list once per page. Dicts preserve insertion order, so blocks on the
    # same page keep their relative order from ``extract_blocks``.
    blocks_by_page = {}
    for block in extract_blocks:
        blocks_by_page.setdefault(block["page_id"], []).append(block)

    if not blocks_by_page:
        return

    # Sorted so the page order handed to table_output is deterministic
    # rather than depending on set iteration order.
    unique_pages = sorted(blocks_by_page)

    table_inputs = [
        {
            "tables": [block["table_bbox"] for block in blocks_by_page[page]],
            # As before, the last block seen on the page supplies img_size.
            "img_size": blocks_by_page[page][-1]["img_size"],
        }
        for page in unique_pages
    ]

    cell_text = table_output(
        filepath,
        table_inputs,
        page_range=unique_pages,
        workers=self.pdftext_workers,
    )
    assert len(cell_text) == len(unique_pages), (
        "Number of pages and table inputs must match"
    )

    for page, page_tables in zip(unique_pages, cell_text):
        page_blocks = blocks_by_page[page]
        # Check the counts before mutating any block so a mismatch cannot
        # leave this page's blocks half-updated.
        assert len(page_blocks) == len(page_tables), (
            "Number of tables and table inputs must match"
        )
        for block, table_text in zip(page_blocks, page_tables):
            if len(table_text) == 0:
                # Re-OCR the block if pdftext didn't find any text
                block["ocr_block"] = True
            else:
                block["table_text_lines"] = table_text
497
498	def align_table_cells(
499	self, table: TableResult, table_detection_result: TextDetectionResult

__call__Method · 0.95

no outgoing calls

no test coverage detected