MCPcopy
hub / github.com/unclecode/crawl4ai / LocalSeleniumCrawlerStrategy

Class LocalSeleniumCrawlerStrategy

crawl4ai/crawler_strategy.py:79–360  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

77 return sanitize_input_encode(html)
78
79class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
80 def __init__(self, use_cached_html=False, js_code=None, **kwargs):
81 super().__init__()
82 print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
83 self.options = Options()
84 self.options.headless = True
85 if kwargs.get("proxy"):
86 self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
87 if kwargs.get("user_agent"):
88 self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
89 else:
90 user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
91 self.options.add_argument(f"--user-agent={user_agent}")
92 self.options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
93
94 self.options.headless = kwargs.get("headless", True)
95 if self.options.headless:
96 self.options.add_argument("--headless")
97
98 self.options.add_argument("--disable-gpu")
99 self.options.add_argument("--window-size=1920,1080")
100 self.options.add_argument("--no-sandbox")
101 self.options.add_argument("--disable-dev-shm-usage")
102 self.options.add_argument("--disable-blink-features=AutomationControlled")
103
104 # self.options.add_argument("--disable-dev-shm-usage")
105 self.options.add_argument("--disable-gpu")
106 # self.options.add_argument("--disable-extensions")
107 # self.options.add_argument("--disable-infobars")
108 # self.options.add_argument("--disable-logging")
109 # self.options.add_argument("--disable-popup-blocking")
110 # self.options.add_argument("--disable-translate")
111 # self.options.add_argument("--disable-default-apps")
112 # self.options.add_argument("--disable-background-networking")
113 # self.options.add_argument("--disable-sync")
114 # self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
115 # self.options.add_argument("--disable-browser-side-navigation")
116 # self.options.add_argument("--dns-prefetch-disable")
117 # self.options.add_argument("--disable-web-security")
118 self.options.add_argument("--log-level=3")
119 self.use_cached_html = use_cached_html
120 self.use_cached_html = use_cached_html
121 self.js_code = js_code
122 self.verbose = kwargs.get("verbose", False)
123
124 # Hooks
125 self.hooks = {
126 'on_driver_created': None,
127 'on_user_agent_updated': None,
128 'before_get_url': None,
129 'after_get_url': None,
130 'before_return_html': None
131 }
132
133 # chromedriver_autoinstaller.install()
134 # import chromedriver_autoinstaller
135 # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
136 # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)

Callers 4

__init__ · Method · 0.85
__init__ · Method · 0.85
using_crawler_hooks · Function · 0.85
create_crawler · Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild: real call sites across dependent graphs

searching dependent graphs…