hub / github.com/impira/docquery / WebDocument

Class WebDocument

src/docquery/document.py:184–234 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

182
183
class WebDocument(Document):
    """Document whose content comes from a web page rendered by a webdriver.

    The page is loaded through the driver, screenshotted, and its word boxes
    are grouped by screenshot page to build the document context.
    """

    def __init__(self, url, **kwargs):
        """Record the target URL and acquire the webdriver.

        Args:
            url: An ``http://``, ``https://`` or ``file://`` URL, or a
                filesystem path (which gets a ``file://`` prefix).
            **kwargs: Forwarded unchanged to the parent ``__init__``.
        """
        # Anything without a recognised scheme is treated as a local path.
        # An input that already is a file:// URL must not be prefixed again,
        # otherwise it would become "file://file://...".
        if not url.startswith(("http://", "https://", "file://")):
            url = "file://" + url
        self.url = url

        # TODO: This is a singleton, which is not thread-safe. We may want to relax this
        # behavior to allow the user to pass in their own driver (which could either be a
        # singleton or a custom instance).
        self.driver = get_webdriver()

        super().__init__(**kwargs)

    def ensure_loaded(self):
        """Point the driver at this document's URL."""
        self.driver.get(self.url)

    @cached_property
    def page_screenshots(self):
        """Load the page and return the driver's scroll-and-screenshot result.

        The result is consumed elsewhere in this class as a pair: element 0 is
        indexed as per-page top offsets, element 1 is iterated as images.
        """
        self.ensure_loaded()
        return self.driver.scroll_and_screenshot()

    @cached_property
    def preview(self) -> List["Image.Image"]:
        """Return every page screenshot converted to RGB."""
        return [img.convert("RGB") for img in self.page_screenshots[1]]

    @cached_property
    def context(self) -> Dict[str, List[Tuple["Image.Image", List[Any]]]]:
        """Bucket the page's word boxes by screenshot page and build the output.

        Each word is assigned to a page by comparing its ``top`` coordinate to
        the page top offsets; its vertical coordinates are then made relative
        to that page.

        Returns:
            Whatever ``_generate_document_output`` produces for the previews,
            per-page words, per-page boxes and per-page viewport dimensions.
        """
        # NOTE(review): on first access the URL is loaded here and again inside
        # page_screenshots; the order is kept as-is because the driver calls
        # may depend on it.
        self.ensure_loaded()
        word_boxes = self.driver.find_word_boxes()

        tops, _ = self.page_screenshots

        n_pages = len(tops)
        page = 0
        offset = 0

        words = [[] for _ in range(n_pages)]
        boxes = [[] for _ in range(n_pages)]
        for word_box in word_boxes["word_boxes"]:
            box = word_box["box"]

            # Advance past *every* page boundary this box has crossed. A single
            # `if` would only move one page forward, so a word that follows a
            # page without any text would be filed under the wrong page with
            # the wrong vertical offset.
            # Assumes word boxes arrive in non-decreasing `top` order -- TODO confirm.
            while page < n_pages - 1 and box["top"] >= tops[page + 1]:
                page += 1
                offset = tops[page]

            words[page].append(word_box["text"])
            # Horizontal coordinates are kept; vertical ones become page-relative.
            boxes[page].append((box["left"], box["top"] - offset, box["right"], box["bottom"] - offset))

        return self._generate_document_output(
            self.preview, words, boxes, [(word_boxes["vw"], word_boxes["vh"])] * n_pages
        )
235
236
237	@validate_arguments

Callers 1

load_documentFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected