(self, cell: SuryaTableCell)
| 185 | page.structure.remove(child.id) |
| 186 | |
def finalize_cell_text(self, cell: SuryaTableCell):
    """Clean up the recognized text lines of a table cell.

    Each entry of ``cell.text_lines`` is read through its ``"text"`` key,
    stripped, and passed through an ordered series of regex rewrites that
    remove filler characters and simplify LaTeX markup. The result then
    goes through ``unwrap_math``, ``fix_text`` and ``self.normalize_spaces``.

    Lines that are empty or consist of a single "." after stripping are
    skipped. Lines that only become empty *after* the rewrites are still
    included in the result.

    Returns:
        A list with one cleaned string per retained line, in input order.
        Empty when ``cell.text_lines`` is falsy.
    """
    # Ordered (pattern, replacement) rules. Order matters: every rule is
    # applied to the output of the previous one.
    rewrite_rules = (
        # Spaced filler runs: ". . .", "- - -", "_ _ _", "… … …"
        (r"(\s?[.\-_…]){2,}", ""),
        # Unspaced filler runs: "...", "---", "___", "……"
        (r"[.\-_…]{2,}", ""),
        # \mathbf{...} holding only digits, separators or currency symbols
        # becomes an HTML bold tag
        (r"\\mathbf\{([0-9.,$€£]+)\}", r"<b>\1</b>"),
        # Commands with an empty argument, e.g. \overline{}
        (r"\\[a-zA-Z]+\{\s*\}", ""),
        # \phantom{...} is removed together with its contents
        (r"\\phantom\{.*?\}", ""),
        # Spacing commands \quad and \,
        (r"\\quad", ""),
        (r"\\,", ""),
        # \mathsf{...} is unwrapped, keeping its contents
        (r"\\mathsf\{([^}]*)\}", r"\1"),
        # A command whose brace is never closed: keep the contents only
        (r"\\[a-zA-Z]+\{([^}]*)$", r"\1"),
        # A string that is entirely \text{...} is unwrapped
        (r"^\s*\\text\{([^}]*)\}\s*$", r"\1"),
    )

    cleaned_lines = []
    for entry in cell.text_lines or []:
        content = entry["text"].strip()
        if content in ("", "."):
            continue

        for pattern, replacement in rewrite_rules:
            content = re.sub(pattern, replacement, content)

        # The rewrites may have removed all LaTeX; unwrap_math (defined
        # outside this block) presumably drops a now-unneeded math
        # wrapper — verify against its definition.
        content = unwrap_math(content)
        content = fix_text(content)
        cleaned_lines.append(self.normalize_spaces(content))
    return cleaned_lines
| 220 | |
| 221 | @staticmethod |
| 222 | def normalize_spaces(text): |
no test coverage detected