MCPcopy
hub / github.com/jdepoix/youtube-transcript-api / _TranscriptParser

Class _TranscriptParser

youtube_transcript_api/_transcripts.py:457–492  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

455
456
class _TranscriptParser:
    """Turn raw transcript XML into ``FetchedTranscriptSnippet`` objects.

    Markup embedded in each snippet's text is removed. With
    ``preserve_formatting=True`` the tags listed in ``_FORMATTING_TAGS`` are
    kept and every other tag is still removed.
    """

    # Tags that are left in place when formatting is preserved.
    _FORMATTING_TAGS = [
        "strong",  # important
        "em",  # emphasized
        "b",  # bold
        "i",  # italic
        "mark",  # marked
        "small",  # smaller
        "del",  # deleted
        "ins",  # inserted
        "sub",  # subscript
        "sup",  # superscript
    ]

    def __init__(self, preserve_formatting: bool = False):
        """Compile the tag-stripping pattern once for reuse by ``parse``.

        Args:
            preserve_formatting: keep the tags in ``_FORMATTING_TAGS`` instead
                of stripping every tag.
        """
        self._html_regex = self._get_html_regex(preserve_formatting)

    def _get_html_regex(self, preserve_formatting: bool) -> Pattern[str]:
        """Build the case-insensitive pattern matching the tags to remove.

        Args:
            preserve_formatting: when true, the pattern skips opening and
                closing tags whose name is in ``_FORMATTING_TAGS``.

        Returns:
            A compiled pattern; each match is one ``<...>`` tag to delete.
        """
        if not preserve_formatting:
            return re.compile(r"<[^>]*>", re.IGNORECASE)

        kept_tags = "|".join(self._FORMATTING_TAGS)
        # The negative lookahead rejects "<name" / "</name" for every kept tag
        # (the \b stops "b" from also protecting e.g. "<br>"). The tag body is
        # consumed with [^>]* so a match always ends at the tag's own ">".
        # The previous ".*?\b>" only ended on a ">" preceded by a word
        # character, so '<font color="#fff">text</font>' was matched as one
        # span and the text between the tags was deleted with them.
        return re.compile(
            r"<(?!\/?(?:" + kept_tags + r")\b)[^>]*>",
            re.IGNORECASE,
        )

    def parse(self, raw_data: str) -> List[FetchedTranscriptSnippet]:
        """Parse transcript XML into a list of snippets.

        Every direct child of the XML root that has text becomes one snippet:
        its text is HTML-unescaped and then stripped of tags, ``start`` is
        read from the required ``start`` attribute and ``duration`` from the
        optional ``dur`` attribute (``0.0`` when absent).

        Args:
            raw_data: the XML document as a string.

        Returns:
            The snippets in document order; children without text are skipped.

        Raises:
            KeyError: if a child with text has no ``start`` attribute.
            ValueError: if ``start`` or ``dur`` is not a valid float.
        """
        strip_tags = self._html_regex.sub
        return [
            FetchedTranscriptSnippet(
                # Unescape first: the markup is stored entity-encoded, so the
                # tags only become visible to the pattern after unescaping.
                text=strip_tags("", unescape(xml_element.text)),
                start=float(xml_element.attrib["start"]),
                duration=float(xml_element.attrib.get("dur", "0.0")),
            )
            for xml_element in ElementTree.fromstring(raw_data)
            if xml_element.text is not None
        ]

Callers 1

fetchMethod · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…