MCPcopy
hub / github.com/impira/docquery / WebDocument

Class WebDocument

src/docquery/document.py:184–234  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

182
183
class WebDocument(Document):
    """Document whose content is a web page rendered through a web driver.

    The page is loaded with ``self.driver``; screenshots and word boxes
    obtained from the driver are turned into the document's preview and
    context.
    """

    def __init__(self, url, **kwargs):
        """Store the target URL and acquire the web driver.

        Args:
            url: An ``http://``, ``https://`` or ``file://`` URL. Any other
                value is treated as a local path and prefixed with
                ``file://``.
            **kwargs: Forwarded unchanged to ``Document.__init__``.
        """
        # An explicit file:// URL must be left alone, otherwise it would end
        # up as "file://file://...".
        if not url.startswith(("http://", "https://", "file://")):
            url = "file://" + url
        self.url = url

        # TODO: This is a singleton, which is not thread-safe. We may want to relax this
        # behavior to allow the user to pass in their own driver (which could either be a
        # singleton or a custom instance).
        self.driver = get_webdriver()

        super().__init__(**kwargs)

    def ensure_loaded(self):
        """Point the driver at this document's URL."""
        self.driver.get(self.url)

    @cached_property
    def page_screenshots(self):
        """Load the page and return the driver's ``scroll_and_screenshot()`` result.

        The result is consumed elsewhere in this class as a pair: element 0
        is used as the per-screenshot top offsets, element 1 as the images.
        """
        self.ensure_loaded()
        return self.driver.scroll_and_screenshot()

    @cached_property
    def preview(self) -> List["Image.Image"]:
        """Return every screenshot image converted to RGB."""
        return [img.convert("RGB") for img in self.page_screenshots[1]]

    @cached_property
    def context(self) -> Dict[str, List[Tuple["Image.Image", List[Any]]]]:
        """Group the page's words and boxes by screenshot and build the context.

        Each word box reported by the driver is assigned to the screenshot
        ("page") it falls on, and its vertical coordinates are made relative
        to that page's top offset. The per-page words and boxes, the preview
        images and one ``(vw, vh)`` size per page are then passed to
        ``self._generate_document_output``.
        """
        self.ensure_loaded()
        word_boxes = self.driver.find_word_boxes()

        tops, _ = self.page_screenshots

        n_pages = len(tops)
        page = 0
        offset = 0

        words = [[] for _ in range(n_pages)]
        boxes = [[] for _ in range(n_pages)]
        # NOTE(review): this single forward pass assumes the driver reports
        # word boxes in top-to-bottom order and that `tops` is increasing --
        # confirm against the driver implementation.
        for word_box in word_boxes["word_boxes"]:
            box = word_box["box"]

            # Advance past every page that ends above this word. A loop (not
            # a single step) is required so that a page containing no words
            # does not leave later words on the wrong page with a stale offset.
            while page < n_pages - 1 and box["top"] >= tops[page + 1]:
                page += 1
                offset = tops[page]

            words[page].append(word_box["text"])
            boxes[page].append((box["left"], box["top"] - offset, box["right"], box["bottom"] - offset))

        return self._generate_document_output(
            self.preview, words, boxes, [(word_boxes["vw"], word_boxes["vh"])] * n_pages
        )
235
236
237@validate_arguments

Callers 1

load_documentFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected