Replace typical Unicode characters in HTML for text processing. Several Unicode characters (e.g., non-printable or formatting) are typically found in HTML but are worth replacing to sanitize text and ensure consistency in text processing tasks. Args: text: The original text. Returns: The sanitized text without typical Unicode characters.
(text: str)
| 4480 | |
@staticmethod
def _clean_unicode(text: str) -> str:
    """Replace typical Unicode characters in HTML for text processing.

    Several Unicode characters (e.g., non-printable or formatting) are typically
    found in HTML but are worth replacing to sanitize text and ensure consistency
    in text processing tasks.

    No-break spaces become a regular space, invisible formatting characters
    are removed, hyphen and dash variants become an ASCII hyphen-minus,
    curly quotation marks become their ASCII counterparts, and the ellipsis
    character becomes three full stops. All other characters are left
    unchanged.

    Args:
        text: The original text.

    Returns:
        The sanitized text without typical Unicode characters.
    """
    replacements = {
        "\u00a0": " ",  # no-break space
        "\u200b": "",  # zero width space
        "\u200c": "",  # zero width non-joiner
        "\u200d": "",  # zero width joiner
        "\u2010": "-",  # hyphen
        "\u2011": "-",  # non-breaking hyphen
        "\u2012": "-",  # figure dash
        "\u2013": "-",  # en dash
        "\u2014": "-",  # em dash
        "\u2015": "-",  # horizontal bar
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2026": "...",  # horizontal ellipsis
        "\u00ad": "",  # soft hyphen
        "\ufeff": "",  # zero width no-break space (byte order mark)
        "\u202f": " ",  # narrow no-break space
        "\u2060": "",  # word joiner
    }
    # Every key is a single code point and no replacement string contains a
    # mapped character, so one translate pass yields exactly the same result
    # as applying the replacements one after another.
    return text.translate(str.maketrans(replacements))
| 4520 | |
| 4521 | @staticmethod |
| 4522 | def _get_cell_spans(cell: Tag) -> tuple[int, int]: |
no test coverage detected