MCPcopy
hub / github.com/datalab-to/marker / finalize_cell_text

Method finalize_cell_text

marker/processors/table.py:187–219  ·  view source on GitHub ↗
(self, cell: SuryaTableCell)

Source from the content-addressed store, hash-verified

185 page.structure.remove(child.id)
186
def finalize_cell_text(self, cell: SuryaTableCell):
    """Clean up the recognized text lines of a table cell.

    Each entry of ``cell.text_lines`` is read through its ``"text"`` key and
    stripped; entries that are empty or a lone "." are skipped. Every
    remaining line is passed through an ordered series of regex
    substitutions (leader removal and LaTeX cleanup), then through
    ``unwrap_math``, ``fix_text`` and ``self.normalize_spaces``.

    Returns:
        A list with one cleaned string per retained line, in input order.
        No filtering happens after cleanup, so every retained line yields
        exactly one entry. A falsy ``cell.text_lines`` yields an empty list.
    """
    # (pattern, replacement) pairs. They are applied strictly in this order;
    # later rules rely on what earlier ones have already removed.
    substitutions = (
        # Spaced leader runs: ". . .", "- - -", "_ _ _", "… … …"
        (r"(\s?[.\-_…]){2,}", ""),
        # Unspaced leader runs: "...", "---", "___", "……"
        (r"[.\-_…]{2,}", ""),
        # \mathbf{...} holding only digits, separators and currency symbols
        # is rewritten as an HTML bold tag
        (r"\\mathbf\{([0-9.,$€£]+)\}", r"<b>\1</b>"),
        # Commands with an empty argument, e.g. \overline{}
        (r"\\[a-zA-Z]+\{\s*\}", ""),
        # \phantom{...} is removed together with its contents
        (r"\\phantom\{.*?\}", ""),
        # Spacing commands: \quad and \,
        (r"\\quad", ""),
        (r"\\,", ""),
        # \mathsf{...} keeps its contents only
        (r"\\mathsf\{([^}]*)\}", r"\1"),
        # A command whose brace is never closed: keep the contents, drop
        # the command itself
        (r"\\[a-zA-Z]+\{([^}]*)$", r"\1"),
        # A string that is entirely \text{...} is unwrapped
        (r"^\s*\\text\{([^}]*)\}\s*$", r"\1"),
    )

    cleaned_lines = []
    for entry in cell.text_lines or []:
        content = entry["text"].strip()
        if content in ("", "."):
            continue

        for pattern, replacement in substitutions:
            content = re.sub(pattern, replacement, content)

        # The substitutions may have removed all LaTeX math, in which case
        # the math wrapper can be dropped as well.
        content = unwrap_math(content)
        cleaned_lines.append(self.normalize_spaces(fix_text(content)))
    return cleaned_lines
220
221 @staticmethod
222 def normalize_spaces(text):

Callers 2

__call__ · Method · 0.95
combine_dollar_column · Method · 0.95

Calls 2

normalize_spaces · Method · 0.95
unwrap_math · Function · 0.90

Tested by

no test coverage detected