| 182 | |
| 183 | |
class WebDocument(Document):
    """Document whose content is read from a web page through a webdriver.

    The page is loaded with the driver returned by ``get_webdriver()``,
    captured as a sequence of screenshots, and the words found on the page
    are grouped by the screenshot ("page") they fall on.
    """

    def __init__(self, url, **kwargs):
        """Store the target URL and acquire the webdriver.

        Args:
            url: Address of the page. A value that does not already start
                with ``http://``, ``https://`` or ``file://`` is treated as a
                local path and gets ``file://`` prepended.
            **kwargs: Forwarded unchanged to ``Document.__init__``.
        """
        # Only prefix values with no recognised scheme; prefixing a value
        # that already starts with "file://" would produce a malformed URL.
        if not url.startswith(("http://", "https://", "file://")):
            url = "file://" + url
        self.url = url

        # TODO: This is a singleton, which is not thread-safe. We may want to relax this
        # behavior to allow the user to pass in their own driver (which could either be a
        # singleton or a custom instance).
        self.driver = get_webdriver()

        super().__init__(**kwargs)

    def ensure_loaded(self):
        """Navigate the driver to ``self.url``.

        This issues ``driver.get`` on every call; it does not check whether
        the page is already loaded.
        """
        self.driver.get(self.url)

    @cached_property
    def page_screenshots(self):
        """Load the page and return the driver's ``scroll_and_screenshot()`` result.

        The other members of this class unpack the result as a pair
        ``(tops, images)``: ``tops`` is compared against word-box ``top``
        coordinates and ``images`` holds one image per page.
        """
        self.ensure_loaded()
        return self.driver.scroll_and_screenshot()

    @cached_property
    def preview(self) -> List["Image.Image"]:
        """Return the page screenshots, each converted to RGB mode."""
        _, images = self.page_screenshots
        return [img.convert("RGB") for img in images]

    @cached_property
    def context(self) -> Dict[str, List[Tuple["Image.Image", List[Any]]]]:
        """Group the page's words and boxes by screenshot page.

        Each word box reported by the driver is assigned to a page by
        comparing its ``top`` coordinate with the page start offsets, and its
        vertical coordinates are shifted so they are relative to that page.
        The grouped words and boxes, the preview images and the
        ``(vw, vh)`` dimensions (repeated once per page) are then handed to
        ``_generate_document_output``.

        NOTE(review): the page assignment only ever moves forward, so it
        assumes the driver reports word boxes in top-to-bottom order --
        confirm against ``find_word_boxes``.
        """
        self.ensure_loaded()
        word_boxes = self.driver.find_word_boxes()

        tops, _ = self.page_screenshots

        n_pages = len(tops)
        last_page = n_pages - 1
        page = 0
        offset = 0

        words = [[] for _ in range(n_pages)]
        boxes = [[] for _ in range(n_pages)]
        for word_box in word_boxes["word_boxes"]:
            box = word_box["box"]

            # Advance past every page boundary this word lies beyond. A single
            # `if` would move forward only one page, so a word following a
            # page with no text would be filed under the wrong page with the
            # wrong vertical offset.
            while page < last_page and box["top"] >= tops[page + 1]:
                page += 1
                offset = tops[page]

            words[page].append(word_box["text"])
            # Make vertical coordinates relative to the top of the word's page.
            boxes[page].append((box["left"], box["top"] - offset, box["right"], box["bottom"] - offset))

        return self._generate_document_output(
            self.preview, words, boxes, [(word_boxes["vw"], word_boxes["vh"])] * n_pages
        )
| 235 | |
| 236 | |
| 237 | @validate_arguments |