MCPcopy
hub / github.com/Aider-AI/aider / Scraper

Class Scraper

aider/scrape.py:79–250  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

77
78
79class Scraper:
80 pandoc_available = None
81 playwright_available = None
82 playwright_instructions_shown = False
83
84 # Public API...
def __init__(self, print_error=None, playwright_available=None, verify_ssl=True):
    """
    Set up a scraper instance.

    `print_error` - a function to call to print error/debug info;
        the builtin `print` is used when none is supplied.
    `playwright_available` - stored on the instance; `scrape()` fetches
        with Playwright when this is truthy and with httpx otherwise.
    `verify_ssl` - if False, disable SSL certificate verification when scraping.
    """
    self.verify_ssl = verify_ssl
    self.playwright_available = playwright_available

    # Any falsy reporter (None included) falls back to the builtin print.
    self.print_error = print_error or print
97
def scrape(self, url):
    """
    Scrape a url and turn it into readable markdown if it's HTML.
    If it's plain text or non-HTML, return it as-is.

    `url` - the URL to scrape.

    Returns the scraped content (converted through `html_to_markdown`
    when it is judged to be HTML), or None after reporting via
    `print_error` when no content could be retrieved.
    """
    # Pick the fetch backend based on the instance's playwright flag.
    if self.playwright_available:
        content, mime_type = self.scrape_with_playwright(url)
    else:
        content, mime_type = self.scrape_with_httpx(url)

    if not content:
        self.print_error(f"Failed to retrieve content from {url}")
        return None

    # A missing MIME type may arrive as None or as an empty string; in
    # either case fall back to sniffing the content. When a type is
    # present, compare case-insensitively since MIME type names are not
    # case-sensitive.
    if mime_type:
        is_html = mime_type.strip().lower().startswith("text/html")
    else:
        is_html = self.looks_like_html(content)

    if is_html:
        self.try_pandoc()
        content = self.html_to_markdown(content)

    return content
123
124 def looks_like_html(self, content):
125 """
126 Check if the content looks like HTML.
127 """
128 if isinstance(content, str):
129 # Check for common HTML tags
130 html_patterns = [
131 r"<!DOCTYPE\s+html",
132 r"<html",
133 r"<head",
134 r"<body",
135 r"<div",
136 r"<p>",

Calls

no outgoing calls