Method scrape

aider/scrape.py:98–122 · view source on GitHub ↗

Scrape a URL and convert it to readable Markdown if the content is HTML. If the content is plain text or any other non-HTML type, return it as-is. `url` - the URL to scrape.

(self, url)

Source from the content-addressed store, hash-verified

96	self.verify_ssl = verify_ssl
97
def scrape(self, url):
    """
    Scrape a url and turn it into readable markdown if it's HTML.
    If it's plain text or non-HTML, return it as-is.

    `url` - the URL to scrape.

    Returns the (possibly converted) content, or None when no content
    could be retrieved; in that case the failure is reported through
    `self.print_error`.
    """

    # Fetch through scrape_with_playwright when the instance flags it as
    # available, otherwise fall back to scrape_with_httpx.
    if self.playwright_available:
        content, mime_type = self.scrape_with_playwright(url)
    else:
        content, mime_type = self.scrape_with_httpx(url)

    if not content:
        self.print_error(f"Failed to retrieve content from {url}")
        return None

    # Media types are case-insensitive, and an unknown type may be reported
    # as either None or an empty string. Normalize so both "unknown" forms
    # take the content-sniffing path instead of silently skipping it.
    normalized_mime = (mime_type or "").strip().lower()

    if normalized_mime:
        is_html = normalized_mime.startswith("text/html")
    else:
        # No usable MIME type: decide by inspecting the content itself.
        is_html = self.looks_like_html(content)

    if is_html:
        self.try_pandoc()
        content = self.html_to_markdown(content)

    return content
123
124	def looks_like_html(self, content):
125	"""

main (Function) · 0.95

test_scrape_actual_url_with_playwright (Method) · 0.95

test_scrape_with_playwright_error_handling (Method) · 0.95

test_scrape_text_plain (Method) · 0.95

test_scrape_text_html (Method) · 0.95

test_scraper_disable_playwright_flag (Function) · 0.95

test_scraper_enable_playwright (Function) · 0.95

cmd_web (Method) · 0.45

do_web (Method) · 0.45

scrape_with_playwright (Method) · 0.95

scrape_with_httpx (Method) · 0.95

looks_like_html (Method) · 0.95

try_pandoc (Method) · 0.95

html_to_markdown (Method) · 0.95

test_scrape_actual_url_with_playwright (Method) · 0.76

test_scrape_with_playwright_error_handling (Method) · 0.76

test_scrape_text_plain (Method) · 0.76

test_scrape_text_html (Method) · 0.76

test_scraper_disable_playwright_flag (Function) · 0.76

test_scraper_enable_playwright (Function) · 0.76