(self, url: str, **kwargs)
| 200 | return self.driver.page_source |
| 201 | |
| 202 | def crawl(self, url: str, **kwargs) -> str: |
| 203 | # Create md5 hash of the URL |
| 204 | import hashlib |
| 205 | url_hash = hashlib.md5(url.encode()).hexdigest() |
| 206 | |
| 207 | if self.use_cached_html: |
| 208 | cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) |
| 209 | if os.path.exists(cache_file_path): |
| 210 | with open(cache_file_path, "r") as f: |
| 211 | return sanitize_input_encode(f.read()) |
| 212 | |
| 213 | try: |
| 214 | self.driver = self.execute_hook('before_get_url', self.driver) |
| 215 | if self.verbose: |
| 216 | print(f"[LOG] 🕸️ Crawling {url} using LocalSeleniumCrawlerStrategy...") |
| 217 | self.driver.get(url) #<html><head></head><body></body></html> |
| 218 | |
| 219 | WebDriverWait(self.driver, 20).until( |
| 220 | lambda d: d.execute_script('return document.readyState') == 'complete' |
| 221 | ) |
| 222 | WebDriverWait(self.driver, 10).until( |
| 223 | EC.presence_of_all_elements_located((By.TAG_NAME, "body")) |
| 224 | ) |
| 225 | |
| 226 | self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") |
| 227 | |
| 228 | self.driver = self.execute_hook('after_get_url', self.driver) |
| 229 | html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source |
| 230 | can_not_be_done_headless = False # Look at my creativity for naming variables |
| 231 | |
| 232 | # TODO: Very ugly approach, but promise to change it! |
| 233 | if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>": |
| 234 | print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...") |
| 235 | can_not_be_done_headless = True |
| 236 | options = Options() |
| 237 | options.headless = False |
| 238 | # set window size very small |
| 239 | options.add_argument("--window-size=5,5") |
| 240 | driver = webdriver.Chrome(service=self.service, options=options) |
| 241 | driver.get(url) |
| 242 | self.driver = self.execute_hook('after_get_url', driver) |
| 243 | html = sanitize_input_encode(driver.page_source) |
| 244 | driver.quit() |
| 245 | |
| 246 | # Execute JS code if provided |
| 247 | self.js_code = kwargs.get("js_code", self.js_code) |
| 248 | if self.js_code and type(self.js_code) == str: |
| 249 | self.driver.execute_script(self.js_code) |
| 250 | # Optionally, wait for some condition after executing the JS code |
| 251 | WebDriverWait(self.driver, 10).until( |
| 252 | lambda driver: driver.execute_script("return document.readyState") == "complete" |
| 253 | ) |
| 254 | elif self.js_code and type(self.js_code) == list: |
| 255 | for js in self.js_code: |
| 256 | self.driver.execute_script(js) |
| 257 | WebDriverWait(self.driver, 10).until( |
| 258 | lambda driver: driver.execute_script("return document.readyState") == "complete" |
| 259 | ) |
nothing calls this directly
no test coverage detected