Parse HTML content and extract text with hyperlinks in markdown format.
(html_content: Optional[str])
| 37 | await page.close() |
| 38 | |
| 39 | def parse_html(html_content: Optional[str]) -> str: |
| 40 | """Parse HTML content and extract text with hyperlinks in markdown format.""" |
| 41 | if not html_content: |
| 42 | return "" |
| 43 | |
| 44 | try: |
| 45 | document = html5lib.parse(html_content) |
| 46 | result = [] |
| 47 | seen_texts = set() # To avoid duplicates |
| 48 | |
def should_skip_element(elem) -> bool:
    """Tell whether an element contributes no extractable text.

    Returns True when the element is an XHTML ``script`` or ``style`` tag,
    or when neither the element nor any of its descendants yields a
    non-whitespace text fragment from ``itertext()``.
    """
    ignored_tags = (
        '{http://www.w3.org/1999/xhtml}script',
        '{http://www.w3.org/1999/xhtml}style',
    )
    if elem.tag in ignored_tags:
        return True

    # Any fragment with visible characters makes the element worth keeping.
    for fragment in elem.itertext():
        if fragment.strip():
            return False
    return True
| 59 | |
def process_element(elem, depth=0):
    """Collect the text of ``elem`` and its descendants in document order.

    Each new text fragment is appended to the enclosing ``result`` list,
    prefixed with ``depth`` spaces, and recorded in ``seen_texts`` so the
    same string is emitted only once.  Text that belongs directly to an
    XHTML ``a`` element is written as a markdown link ``[text](href)``;
    anchor text is omitted when the anchor has no href or the href starts
    with ``#`` or ``javascript:``.

    Elements rejected by ``should_skip_element`` contribute neither their
    own text nor their children, but their tail is still processed: the
    tail is the text that follows the element inside its parent (for
    example the text after a ``<br>`` or ``<script>``), so dropping it
    would lose parent content.

    Args:
        elem: ElementTree-style element to walk.
        depth: Nesting level of ``elem``; used as the indent width.
    """
    indent = " " * depth

    if not should_skip_element(elem):
        # Handle the element's own leading text.
        text = (getattr(elem, 'text', None) or "").strip()
        if text and text not in seen_texts:
            if elem.tag == '{http://www.w3.org/1999/xhtml}a':
                # First attribute whose name ends in "href" (covers
                # namespaced attribute names as well as plain "href").
                href = next(
                    (value for attr, value in elem.items()
                     if attr.endswith('href')),
                    None,
                )
                if href and not href.startswith(('#', 'javascript:')):
                    # Format as markdown link
                    result.append(f"{indent}[{text}]({href})")
                    seen_texts.add(text)
            else:
                result.append(indent + text)
                seen_texts.add(text)

        # Process children
        for child in elem:
            process_element(child, depth + 1)

    # Handle tail text, even for skipped elements: it is part of the
    # parent's content, not of the element being skipped.
    tail = (getattr(elem, 'tail', None) or "").strip()
    if tail and tail not in seen_texts:
        result.append(indent + tail)
        seen_texts.add(tail)
| 95 | |
| 96 | # Start processing from the body tag |