Prepares text for a TTS API by cleaning Markdown and adding minimal contextual hints for certain Markdown elements like headers. Preserves paragraph separation. Args: text (str): The raw text containing Markdown or other formatting. Returns: str: Cleaned text with contextual hints suitable for TTS input.
(text: str)
| 2 | import emoji |
| 3 | |
def prepare_tts_input_with_context(text: str) -> str:
    """
    Prepares text for a TTS API by cleaning Markdown and adding minimal contextual hints
    for certain Markdown elements like headers. Preserves paragraph separation.

    The substitutions are order-sensitive: fenced code blocks are replaced before
    inline code (otherwise the inline-code pattern consumes part of the fence), and
    images are handled before links (otherwise the link pattern matches the
    ``[alt](url)`` part of ``![alt](url)`` and leaves a stray ``!``).

    Args:
        text (str): The raw text containing Markdown or other formatting.

    Returns:
        str: Cleaned text with contextual hints suitable for TTS input.
    """
    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Replace fenced (multi-line) code blocks with a short description. This runs
    # first so that neither the header rule (e.g. "# comment" lines inside code)
    # nor the inline-code rule can touch the block's content.
    text = re.sub(r"```[\s\S]+?```", "(code block omitted)", text)

    # Add context for headers: level 1 -> Title, level 2 -> Section, deeper -> Subsection
    header_labels = {1: "Title", 2: "Section"}

    def header_replacer(match):
        level = len(match.group(1))  # Number of '#' symbols
        label = header_labels.get(level, "Subsection")
        header_text = match.group(2).strip()
        # The extra newline separates the header from the following text; runs of
        # newlines are normalized to a single paragraph break further below.
        return f"{label} — {header_text}\n"

    # Only spaces/tabs may separate the '#' run from the header text; "\s+" would
    # also match newlines and let an empty header swallow the next line.
    text = re.sub(r"^(#{1,6})[ \t]+(.*)", header_replacer, text, flags=re.MULTILINE)

    # Replace image syntax with its alt text (must precede the link rule)
    text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", r"Image: \1", text)

    # Announce links (currently commented out for potential future use)
    # text = re.sub(r"\[([^\]]+)\]\((https?:\/\/[^\)]+)\)", r"\1 (link: \2)", text)

    # Remove links while keeping the link text
    text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)

    # Describe inline code
    text = re.sub(r"`([^`]+)`", r"code snippet: \1", text)

    # Remove bold/italic symbols but keep the content.
    # NOTE: this strips every '*' and '_' character, so list bullets ("* item")
    # and underscores inside identifiers are removed as well.
    text = re.sub(r"(\*\*|__|\*|_)", '', text)

    # Remove HTML comments and tags. A tag must start with a letter (optionally
    # preceded by '/' or '!') and be closed by '>', so a bare '<' in prose such as
    # "a < b" no longer deletes everything up to the end of the text.
    text = re.sub(r"<!--[\s\S]*?-->|<[/!]?[A-Za-z][^<>]*>", '', text)

    # Normalize line breaks
    text = re.sub(r"\n{2,}", '\n\n', text)  # Ensure consistent paragraph separation

    # Replace multiple spaces within lines
    text = re.sub(r" {2,}", ' ', text)

    # Trim leading and trailing whitespace from the whole text
    return text.strip()
no outgoing calls
no test coverage detected