MCPcopy
hub / github.com/impira/docquery / context

Method context

src/docquery/document.py:112–144  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

110
@cached_property
def context(self) -> Dict[str, List[Tuple["Image.Image", List[Any]]]]:
    """Collect per-page words, boxes and dimensions and build the document output.

    For each PDF page, words and bounding boxes are taken from the page's
    embedded text (``page.extract_words()``) when ``self.use_embedded_text``
    is truthy and at least one word is found. Otherwise the matching page
    image is passed to ``self.ocr_reader.apply_ocr``. Dimensions are recorded
    from the same source as the words: the PDF page for embedded text, the
    image for OCR.

    Returns:
        An empty dict when ``self._pdf`` is ``None``; otherwise the result of
        ``self._generate_document_output(images, words, boxes, dimensions)``.

    Raises:
        ValueError: If the number of page images differs from the number of
            PDF pages.
    """
    pdf = self._pdf
    if pdf is None:
        return {}

    images = self._images
    pages = pdf.pages

    if len(images) != len(pages):
        raise ValueError(
            f"Mismatch: pdfplumber() thinks there are {len(pages)} pages and"
            f" pdf2image thinks there are {len(images)}"
        )

    words_by_page = []
    boxes_by_page = []
    dimensions_by_page = []
    # Lengths were verified above, so zip pairs every page with its image.
    for page, image in zip(pages, images):
        extracted_words = page.extract_words() if self.use_embedded_text else []

        if extracted_words:
            words = [w["text"] for w in extracted_words]
            boxes = [[w["x0"], w["top"], w["x1"], w["bottom"]] for w in extracted_words]
            # Embedded-text boxes are paired with the PDF page's own size.
            dimensions = (page.width, page.height)
        else:
            # No embedded words (or embedded text disabled): fall back to OCR
            # on the page image and pair the boxes with the image size.
            words, boxes = self.ocr_reader.apply_ocr(image)
            dimensions = (image.width, image.height)

        words_by_page.append(words)
        boxes_by_page.append(boxes)
        dimensions_by_page.append(dimensions)

    return self._generate_document_output(images, words_by_page, boxes_by_page, dimensions_by_page)
145
146 @cached_property
147 def preview(self) -> "Image":

Callers

nothing calls this directly

Calls 2

apply_ocrMethod · 0.45

Tested by

no test coverage detected