# Node names gathered under the "black-listed" label. Presumably the crawler
# skips nodes with these names when it collects page elements -- the code that
# consumes this set is outside this view, so confirm before relying on that.
black_listed_elements = {
    "html",
    "head",
    "title",
    "meta",
    "iframe",
    "body",
    "script",
    "style",
    "path",
    "svg",
    "br",
    "::marker",
}
| 162 | |
| 163 | class Crawler: |
def __init__(self):
    """Start Playwright, launch a non-headless Chromium and open one page.

    Attributes set here:
        playwright: the started Playwright driver handle, kept so that it
            can be stopped (``self.playwright.stop()``) once the crawler
            is no longer needed.
        browser: the launched Chromium browser.
        page: a new page with a 1280x1080 viewport.
        client: ``None`` until ``go_to_page`` opens a CDP session.
        page_element_buffer: empty dict until populated elsewhere.
    """
    # Keep the driver handle instead of discarding it inside a chained
    # expression; without a reference there is no way to call stop() on it.
    self.playwright = sync_playwright().start()
    self.browser = self.playwright.chromium.launch(headless=False)

    self.page = self.browser.new_page()
    self.page.set_viewport_size({"width": 1280, "height": 1080})

    # Define these up front so methods that read them (e.g. click) do not
    # fail with AttributeError when called before the first go_to_page().
    self.client = None
    self.page_element_buffer = {}
| 175 | |
def go_to_page(self, url):
    """Navigate the page to ``url`` and reset per-page state.

    Args:
        url: Target address. When it contains no ``"://"`` the prefix
            ``"http://"`` is prepended before navigating.

    After a successful navigation a fresh CDP session is stored on
    ``self.client`` and ``self.page_element_buffer`` is reset to an empty
    dict. Any CDP session left over from an earlier call is detached first
    so sessions do not pile up across navigations.
    """
    target = url if "://" in url else "http://" + url
    self.page.goto(url=target)

    # getattr: the attribute does not exist yet on the very first call.
    previous_session = getattr(self, "client", None)
    if previous_session is not None:
        previous_session.detach()

    self.client = self.page.context.new_cdp_session(self.page)
    self.page_element_buffer = {}
| 180 | |
def scroll(self, direction):
    """Scroll the page by one window height.

    Args:
        direction: ``"up"`` subtracts ``window.innerHeight`` from the
            scroll position, ``"down"`` adds it. Any other value is
            ignored and nothing is evaluated.
    """
    if direction == "up":
        operator = "-"
    elif direction == "down":
        operator = "+"
    else:
        return

    # Both scripts differ only in the sign applied to the window height.
    scroller = "(document.scrollingElement || document.body)"
    script = f"{scroller}.scrollTop = {scroller}.scrollTop {operator} window.innerHeight;"
    self.page.evaluate(script)
| 190 | |
def click(self, id):
    """Click the centre of the buffered element registered under ``id``.

    Args:
        id: Key of the element in ``self.page_element_buffer``; it is
            converted with ``int()`` before the lookup. A value that cannot
            be converted (e.g. ``None`` or non-numeric text) is treated the
            same as an unknown id.

    Prints ``"Could not find element"`` and does not click when no element
    is found. The ``target`` attributes are stripped from links in either
    case.
    """
    # Inject javascript into the page which removes the target= attribute
    # from all links (presumably so a clicked link loads in this page
    # rather than in a new tab the crawler does not control).
    js = """
    links = document.getElementsByTagName("a");
    for (var i = 0; i < links.length; i++) {
        links[i].removeAttribute("target");
    }
    """
    self.page.evaluate(js)

    try:
        element_key = int(id)
    except (TypeError, ValueError):
        # Malformed id: fall through to the "not found" path, not a crash.
        element = None
    else:
        element = self.page_element_buffer.get(element_key)

    if not element:
        print("Could not find element")
        return

    self.page.mouse.click(element.get("center_x"), element.get("center_y"))
| 209 | |
def type(self, id, text):
    """Click the element registered under ``id``, then type ``text``.

    Args:
        id: Element id, passed unchanged to ``self.click``.
        text: Characters sent through the page keyboard after the click.
    """
    self.click(id)
    keyboard = self.page.keyboard
    keyboard.type(text)
| 213 | |
def enter(self):
    """Press the Enter key on the page keyboard."""
    keyboard = self.page.keyboard
    keyboard.press("Enter")
| 216 | |
| 217 | def crawl(self): |
| 218 | page = self.page |
| 219 | page_element_buffer = self.page_element_buffer |
| 220 | start = time.time() |