| 455 | |
| 456 | |
class _TranscriptParser:
    """Turn raw transcript XML into a list of ``FetchedTranscriptSnippet``.

    Markup embedded in each snippet's text is removed with a regular
    expression. When ``preserve_formatting`` is true, the tags listed in
    ``_FORMATTING_TAGS`` are left in the text and every other tag is removed.
    """

    # Tag names that survive stripping when ``preserve_formatting`` is enabled.
    _FORMATTING_TAGS = [
        "strong",  # important
        "em",  # emphasized
        "b",  # bold
        "i",  # italic
        "mark",  # marked
        "small",  # smaller
        "del",  # deleted
        "ins",  # inserted
        "sub",  # subscript
        "sup",  # superscript
    ]

    def __init__(self, preserve_formatting: bool = False):
        """Compile the tag-stripping pattern once for this parser.

        Args:
            preserve_formatting: If true, keep the tags named in
                ``_FORMATTING_TAGS``; otherwise strip every tag.
        """
        self._html_regex = self._get_html_regex(preserve_formatting)

    def _get_html_regex(self, preserve_formatting: bool) -> Pattern[str]:
        """Build the compiled pattern matching the tags that must be removed.

        Args:
            preserve_formatting: If true, the pattern skips opening and
                closing tags whose name is in ``_FORMATTING_TAGS``.

        Returns:
            A case-insensitive compiled pattern; substituting its matches
            with ``""`` strips the unwanted tags.
        """
        if not preserve_formatting:
            return re.compile(r"<[^>]*>", re.IGNORECASE)

        kept_tags = "|".join(self._FORMATTING_TAGS)
        # The negative lookahead rejects "<name" / "</name" for every kept tag;
        # the trailing \b stops "b" from also protecting e.g. "<br>".
        # The tag body is "[^>]*" so each match ends at the tag's own ">".
        # The previous ".*?\b>" needed a word character right before ">",
        # so tags ending in a quote or slash (<font color="#fff">, <br/>)
        # ran on to a later ">" and swallowed the text in between.
        return re.compile(
            r"</?(?!/?(?:" + kept_tags + r")\b)[^>]*>",
            re.IGNORECASE,
        )

    def parse(self, raw_data: str) -> List[FetchedTranscriptSnippet]:
        """Parse transcript XML into snippets.

        Each child element of the XML root that has text becomes one snippet:
        its text is HTML-unescaped and stripped of tags, ``start`` is read
        from the required ``start`` attribute, and ``duration`` from the
        optional ``dur`` attribute (``0.0`` when absent). Children whose
        ``text`` is ``None`` are skipped.

        Args:
            raw_data: The transcript document as an XML string.

        Returns:
            The snippets in document order.

        Raises:
            KeyError: If a child element with text has no ``start`` attribute.
            ValueError: If ``start`` or ``dur`` is not a valid float.
        """
        strip_tags = self._html_regex.sub
        snippets = []
        for xml_element in ElementTree.fromstring(raw_data):
            if xml_element.text is None:
                continue
            snippets.append(
                FetchedTranscriptSnippet(
                    text=strip_tags("", unescape(xml_element.text)),
                    start=float(xml_element.attrib["start"]),
                    duration=float(xml_element.attrib.get("dur", "0.0")),
                )
            )
        return snippets
no outgoing calls
no test coverage detected
searching dependent graphs…