hub / github.com/grapeot/devin.cursorrules / parse_html

Function parse_html

tools/web_scraper.py:39–124 · view source on GitHub ↗

Parse HTML content and extract text with hyperlinks in markdown format.

(html_content: Optional[str])

Source from the content-addressed store, hash-verified

37	await page.close()
38
39	def parse_html(html_content: Optional[str]) -> str:
40	"""Parse HTML content and extract text with hyperlinks in markdown format."""
41	if not html_content:
42	return ""
43
44	try:
45	document = html5lib.parse(html_content)
46	result = []
47	seen_texts = set() # To avoid duplicates
48
49	def should_skip_element(elem) -> bool:
50	"""Check if the element should be skipped."""
51	# Skip script and style tags
52	if elem.tag in ['{http://www.w3.org/1999/xhtml}script',
53	'{http://www.w3.org/1999/xhtml}style']:
54	return True
55	# Skip empty elements or elements with only whitespace
56	if not any(text.strip() for text in elem.itertext()):
57	return True
58	return False
59
60	def process_element(elem, depth=0):
61	"""Process an element and its children recursively."""
62	if should_skip_element(elem):
63	return
64
65	# Handle text content
66	if hasattr(elem, 'text') and elem.text:
67	text = elem.text.strip()
68	if text and text not in seen_texts:
69	# Check if this is an anchor tag
70	if elem.tag == '{http://www.w3.org/1999/xhtml}a':
71	href = None
72	for attr, value in elem.items():
73	if attr.endswith('href'):
74	href = value
75	break
76	if href and not href.startswith(('#', 'javascript:')):
77	# Format as markdown link
78	link_text = f"[{text}]({href})"
79	result.append(" " * depth + link_text)
80	seen_texts.add(text)
81	else:
82	result.append(" " * depth + text)
83	seen_texts.add(text)
84
85	# Process children
86	for child in elem:
87	process_element(child, depth + 1)
88
89	# Handle tail text
90	if hasattr(elem, 'tail') and elem.tail:
91	tail = elem.tail.strip()
92	if tail and tail not in seen_texts:
93	result.append(" " * depth + tail)
94	seen_texts.add(tail)
95
96	# Start processing from the body tag

Callers 1

test_parse_html · Method · 0.90

Calls 1

process_element · Function · 0.85

Tested by 1

test_parse_html · Method · 0.72