MCPcopy
hub / github.com/datalab-to/marker / __call__

Method __call__

marker/processors/table.py:85–185  ·  view source on GitHub ↗
(self, document: Document)

Source from the content-addressed store, hash-verified

83 self.detection_model = detection_model
84
85 def __call__(self, document: Document):
86 filepath = document.filepath # Path to original pdf file
87
88 table_data = []
89 for page in document.pages:
90 for block in page.contained_blocks(document, self.block_types):
91 if block.block_type == BlockTypes.Table:
92 block.polygon = block.polygon.expand(0.01, 0.01)
93 image = block.get_image(document, highres=True)
94 image_poly = block.polygon.rescale(
95 (page.polygon.width, page.polygon.height),
96 page.get_image(highres=True).size,
97 )
98
99 table_data.append(
100 {
101 "block_id": block.id,
102 "page_id": page.page_id,
103 "table_image": image,
104 "table_bbox": image_poly.bbox,
105 "img_size": page.get_image(highres=True).size,
106 "ocr_block": any(
107 [
108 page.text_extraction_method in ["surya"],
109 page.ocr_errors_detected,
110 ]
111 ),
112 }
113 )
114
115 # Detect tables and cells
116 self.table_rec_model.disable_tqdm = self.disable_tqdm
117 tables: List[TableResult] = self.table_rec_model(
118 [t["table_image"] for t in table_data],
119 batch_size=self.get_table_rec_batch_size(),
120 )
121 assert len(tables) == len(table_data), (
122 "Number of table results should match the number of tables"
123 )
124
125 # Assign cell text if we don't need OCR
126 # We do this at a line level
127 extract_blocks = [t for t in table_data if not t["ocr_block"]]
128 self.assign_pdftext_lines(
129 extract_blocks, filepath
130 ) # Handle tables where good text exists in the PDF
131 self.assign_text_to_cells(tables, table_data)
132
133 # Assign OCR lines if needed - we do this at a cell level
134 self.assign_ocr_lines(tables, table_data)
135
136 self.split_combined_rows(tables) # Split up rows that were combined
137 self.combine_dollar_column(tables) # Combine columns that are just dollar signs
138
139 # Assign table cells to the table
140 table_idx = 0
141 for page in document.pages:
142 for block in page.contained_blocks(document, self.block_types):

Callers

nothing calls this directly

Calls 15

assign_pdftext_lines · Method · 0.95
assign_text_to_cells · Method · 0.95
assign_ocr_lines · Method · 0.95
split_combined_rows · Method · 0.95
combine_dollar_column · Method · 0.95
finalize_cell_text · Method · 0.95
PolygonBox · Class · 0.90
TableCell · Class · 0.90
matrix_intersection_area · Function · 0.90
expand · Method · 0.80
rescale · Method · 0.80

Tested by

no test coverage detected