MCPcopy
hub / github.com/unclecode/crawl4ai / __init__

Method __init__

crawl4ai/crawler_strategy.py:80–155  ·  view source on GitHub ↗
(self, use_cached_html=False, js_code=None, **kwargs)

Source from the content-addressed store, hash-verified

78
79class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
80 def __init__(self, use_cached_html=False, js_code=None, **kwargs):
81 super().__init__()
82 print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
83 self.options = Options()
84 self.options.headless = True
85 if kwargs.get("proxy"):
86 self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
87 if kwargs.get("user_agent"):
88 self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
89 else:
90 user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
91 self.options.add_argument(f"--user-agent={user_agent}")
92 self.options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
93
94 self.options.headless = kwargs.get("headless", True)
95 if self.options.headless:
96 self.options.add_argument("--headless")
97
98 self.options.add_argument("--disable-gpu")
99 self.options.add_argument("--window-size=1920,1080")
100 self.options.add_argument("--no-sandbox")
101 self.options.add_argument("--disable-dev-shm-usage")
102 self.options.add_argument("--disable-blink-features=AutomationControlled")
103
104 # self.options.add_argument("--disable-dev-shm-usage")
105 self.options.add_argument("--disable-gpu")
106 # self.options.add_argument("--disable-extensions")
107 # self.options.add_argument("--disable-infobars")
108 # self.options.add_argument("--disable-logging")
109 # self.options.add_argument("--disable-popup-blocking")
110 # self.options.add_argument("--disable-translate")
111 # self.options.add_argument("--disable-default-apps")
112 # self.options.add_argument("--disable-background-networking")
113 # self.options.add_argument("--disable-sync")
114 # self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
115 # self.options.add_argument("--disable-browser-side-navigation")
116 # self.options.add_argument("--dns-prefetch-disable")
117 # self.options.add_argument("--disable-web-security")
118 self.options.add_argument("--log-level=3")
119 self.use_cached_html = use_cached_html
120 self.use_cached_html = use_cached_html
121 self.js_code = js_code
122 self.verbose = kwargs.get("verbose", False)
123
124 # Hooks
125 self.hooks = {
126 'on_driver_created': None,
127 'on_user_agent_updated': None,
128 'before_get_url': None,
129 'after_get_url': None,
130 'before_return_html': None
131 }
132
133 # chromedriver_autoinstaller.install()
134 # import chromedriver_autoinstaller
135 # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
136 # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
137 # chromedriver_path = chromedriver_autoinstaller.install()

Callers

nothing calls this directly

Calls 2

execute_hook · Method · 0.95
__init__ · Method · 0.45

Tested by

no test coverage detected