(self, use_cached_html=False, js_code=None, **kwargs)
| 78 | |
| 79 | class LocalSeleniumCrawlerStrategy(CrawlerStrategy): |
def __init__(self, use_cached_html: bool = False, js_code=None, **kwargs) -> None:
    """Configure Chrome options and crawler state for a local Selenium session.

    Args:
        use_cached_html: Stored on the instance as ``use_cached_html``.
        js_code: Stored on the instance as ``js_code``.
        **kwargs: Optional settings read by this method:
            proxy: Value passed to Chrome via ``--proxy-server``.
            user_agent: User-agent string; the built-in default below is
                used when the key is missing or its value is empty.
            headless: Whether to request headless mode (default ``True``).
            verbose: Stored on the instance as ``verbose``
                (default ``False``).
    """
    super().__init__()
    print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")

    default_user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )

    self.options = Options()

    proxy = kwargs.get("proxy")
    if proxy:
        self.options.add_argument(f"--proxy-server={proxy}")

    # Add exactly one user-agent switch so a caller-supplied value is not
    # followed by a second, hard-coded one, and so an explicit
    # ``user_agent=None`` / ``""`` falls back to the default instead of
    # producing ``--user-agent=None``.
    user_agent = kwargs.get("user_agent") or default_user_agent
    self.options.add_argument(f"--user-agent={user_agent}")

    # Single assignment: the previous unconditional ``headless = True`` was
    # always overwritten by this kwargs lookup.
    self.options.headless = kwargs.get("headless", True)
    if self.options.headless:
        self.options.add_argument("--headless")

    # Fixed browser switches, each added once and in the original order.
    for switch in (
        "--disable-gpu",
        "--window-size=1920,1080",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "--log-level=3",
    ):
        self.options.add_argument(switch)

    self.use_cached_html = use_cached_html
    self.js_code = js_code
    self.verbose = kwargs.get("verbose", False)

    # Hooks: every slot starts unset (None); nothing in this section
    # registers or invokes them.
    self.hooks = {
        'on_driver_created': None,
        'on_user_agent_updated': None,
        'before_get_url': None,
        'after_get_url': None,
        'before_return_html': None,
    }

    # NOTE(review): no WebDriver is created in the visible part of this
    # method; the chromedriver-install experiments that were left here as
    # commented-out code have been removed. Confirm where driver creation
    # lives before relying on this constructor alone.
nothing calls this directly
no test coverage detected