Class Scraper

aider/scrape.py:79–250 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

77
78
79	class Scraper:
80	pandoc_available = None
81	playwright_available = None
82	playwright_instructions_shown = False
83
84	# Public API...
85	def __init__(self, print_error=None, playwright_available=None, verify_ssl=True):
86	"""
87	`print_error` - a function to call to print error/debug info.
88	`verify_ssl` - if False, disable SSL certificate verification when scraping.
89	"""
90	if print_error:
91	self.print_error = print_error
92	else:
93	self.print_error = print
94
95	self.playwright_available = playwright_available
96	self.verify_ssl = verify_ssl
97
98	def scrape(self, url):
99	"""
100	Scrape a url and turn it into readable markdown if it's HTML.
101	If it's plain text or non-HTML, return it as-is.
102
103	`url` - the URL to scrape.
104	"""
105
106	if self.playwright_available:
107	content, mime_type = self.scrape_with_playwright(url)
108	else:
109	content, mime_type = self.scrape_with_httpx(url)
110
111	if not content:
112	self.print_error(f"Failed to retrieve content from {url}")
113	return None
114
115	# Check if the content is HTML based on MIME type or content
116	if (mime_type and mime_type.startswith("text/html")) or (
117	mime_type is None and self.looks_like_html(content)
118	):
119	self.try_pandoc()
120	content = self.html_to_markdown(content)
121
122	return content
123
124	def looks_like_html(self, content):
125	"""
126	Check if the content looks like HTML.
127	"""
128	if isinstance(content, str):
129	# Check for common HTML tags
130	html_patterns = [
131	r"<!DOCTYPE\s+html",
132	r"<html",
133	r"<head",
134	r"<body",
135	r"<div",
136	r"<p>",

cmd_webMethod · 0.90

do_webMethod · 0.90

test_scrape_self_signed_sslMethod · 0.90

test_scrape_actual_url_with_playwrightMethod · 0.90

test_scraper_print_error_not_calledMethod · 0.90

test_scrape_with_playwright_error_handlingMethod · 0.90

test_scrape_text_plainMethod · 0.90

test_scrape_text_htmlMethod · 0.90

test_scraper_disable_playwright_flagFunction · 0.90

test_scraper_enable_playwrightFunction · 0.90

mainFunction · 0.85

no outgoing calls

test_scrape_self_signed_sslMethod · 0.72

test_scrape_actual_url_with_playwrightMethod · 0.72

test_scraper_print_error_not_calledMethod · 0.72

test_scrape_with_playwright_error_handlingMethod · 0.72

test_scrape_text_plainMethod · 0.72

test_scrape_text_htmlMethod · 0.72

test_scraper_disable_playwright_flagFunction · 0.72

test_scraper_enable_playwrightFunction · 0.72