MCPcopy Index your code
hub / github.com/grapeot/devin.cursorrules / parse_html

Function parse_html

tools/web_scraper.py:39–124  ·  view source on GitHub ↗

Parse HTML content and extract text with hyperlinks in markdown format.

(html_content: Optional[str])

Source from the content-addressed store, hash-verified

37 await page.close()
38
39 def parse_html(html_content: Optional[str]) -> str:
40 """Parse HTML content and extract text with hyperlinks in markdown format."""
41 if not html_content:
42 return ""
43
44 try:
45 document = html5lib.parse(html_content)
46 result = []
47 seen_texts = set() # To avoid duplicates
48
49 def should_skip_element(elem) -> bool:
50 """Check if the element should be skipped."""
51 # Skip script and style tags
52 if elem.tag in ['{http://www.w3.org/1999/xhtml}script',
53 '{http://www.w3.org/1999/xhtml}style']:
54 return True
55 # Skip empty elements or elements with only whitespace
56 if not any(text.strip() for text in elem.itertext()):
57 return True
58 return False
59
60 def process_element(elem, depth=0):
61 """Process an element and its children recursively."""
62 if should_skip_element(elem):
63 return
64
65 # Handle text content
66 if hasattr(elem, 'text') and elem.text:
67 text = elem.text.strip()
68 if text and text not in seen_texts:
69 # Check if this is an anchor tag
70 if elem.tag == '{http://www.w3.org/1999/xhtml}a':
71 href = None
72 for attr, value in elem.items():
73 if attr.endswith('href'):
74 href = value
75 break
76 if href and not href.startswith(('#', 'javascript:')):
77 # Format as markdown link
78 link_text = f"[{text}]({href})"
79 result.append(" " * depth + link_text)
80 seen_texts.add(text)
81 else:
82 result.append(" " * depth + text)
83 seen_texts.add(text)
84
85 # Process children
86 for child in elem:
87 process_element(child, depth + 1)
88
89 # Handle tail text
90 if hasattr(elem, 'tail') and elem.tail:
91 tail = elem.tail.strip()
92 if tail and tail not in seen_texts:
93 result.append(" " * depth + tail)
94 seen_texts.add(tail)
95
96 # Start processing from the body tag

Callers 1

test_parse_html · Method · 0.90

Calls 1

process_element · Function · 0.85

Tested by 1

test_parse_html · Method · 0.72