MCPcopy
hub / github.com/unclecode/crawl4ai / crawl

Method crawl

crawl4ai/crawler_strategy.py:202–298  ·  view source on GitHub ↗
(self, url: str, **kwargs)

Source from the content-addressed store, hash-verified

200 return self.driver.page_source
201
202 def crawl(self, url: str, **kwargs) -> str:
203 # Create md5 hash of the URL
204 import hashlib
205 url_hash = hashlib.md5(url.encode()).hexdigest()
206
207 if self.use_cached_html:
208 cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
209 if os.path.exists(cache_file_path):
210 with open(cache_file_path, "r") as f:
211 return sanitize_input_encode(f.read())
212
213 try:
214 self.driver = self.execute_hook('before_get_url', self.driver)
215 if self.verbose:
216 print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...")
217 self.driver.get(url) #<html><head></head><body></body></html>
218
219 WebDriverWait(self.driver, 20).until(
220 lambda d: d.execute_script('return document.readyState') == 'complete'
221 )
222 WebDriverWait(self.driver, 10).until(
223 EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
224 )
225
226 self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
227
228 self.driver = self.execute_hook('after_get_url', self.driver)
229 html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source
230 can_not_be_done_headless = False # Look at my creativity for naming variables
231
232 # TODO: Very ugly approach, but promise to change it!
233 if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>":
234 print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
235 can_not_be_done_headless = True
236 options = Options()
237 options.headless = False
238 # set window size very small
239 options.add_argument("--window-size=5,5")
240 driver = webdriver.Chrome(service=self.service, options=options)
241 driver.get(url)
242 self.driver = self.execute_hook('after_get_url', driver)
243 html = sanitize_input_encode(driver.page_source)
244 driver.quit()
245
246 # Execute JS code if provided
247 self.js_code = kwargs.get("js_code", self.js_code)
248 if self.js_code and type(self.js_code) == str:
249 self.driver.execute_script(self.js_code)
250 # Optionally, wait for some condition after executing the JS code
251 WebDriverWait(self.driver, 10).until(
252 lambda driver: driver.execute_script("return document.readyState") == "complete"
253 )
254 elif self.js_code and type(self.js_code) == list:
255 for js in self.js_code:
256 self.driver.execute_script(js)
257 WebDriverWait(self.driver, 10).until(
258 lambda driver: driver.execute_script("return document.readyState") == "complete"
259 )

Callers

nothing calls this directly

Calls 4

execute_hook · Method · 0.95
_ensure_page_load · Method · 0.95
sanitize_input_encode · Function · 0.85
quit · Method · 0.80

Tested by

no test coverage detected