(self)
| 110 | |
@cached_property
def context(self) -> Dict[str, List[Tuple["Image.Image", List[Any]]]]:
    """Collect per-page words, boxes and dimensions and build the document output.

    Returns an empty dict when ``self._pdf`` is ``None``. Otherwise each page
    is paired with the entry of ``self._images`` at the same index:

    * if ``self.use_embedded_text`` is set and ``page.extract_words()`` yields
      at least one word, the words' ``text`` and ``[x0, top, x1, bottom]``
      boxes are used together with the page's own width/height;
    * otherwise ``self.ocr_reader.apply_ocr`` is run on the image and the
      image's width/height are recorded instead.

    The images and the three per-page lists are then handed to
    ``self._generate_document_output``, whose result is returned.

    Raises:
        ValueError: if the number of images differs from the number of pages.
    """
    document = self._pdf
    if document is None:
        return {}

    page_images = self._images
    pages = document.pages

    if len(page_images) != len(pages):
        raise ValueError(
            f"Mismatch: pdfplumber() thinks there are {len(pages)} pages and"
            f" pdf2image thinks there are {len(page_images)}"
        )

    all_words = []
    all_boxes = []
    all_dimensions = []
    # Lengths were verified equal above, so zip pairs every page with its image.
    for page, image in zip(pages, page_images):
        embedded = page.extract_words() if self.use_embedded_text else []

        if len(embedded) > 0:
            # Embedded text is available: coordinates are in page units.
            page_words = [entry["text"] for entry in embedded]
            page_boxes = [
                [entry["x0"], entry["top"], entry["x1"], entry["bottom"]]
                for entry in embedded
            ]
            page_size = (page.width, page.height)
        else:
            # No embedded text (or it is disabled): fall back to OCR on the image,
            # so the recorded dimensions are those of the image.
            page_words, page_boxes = self.ocr_reader.apply_ocr(image)
            page_size = (image.width, image.height)

        all_words.append(page_words)
        all_boxes.append(page_boxes)
        all_dimensions.append(page_size)

    return self._generate_document_output(page_images, all_words, all_boxes, all_dimensions)
| 145 | |
| 146 | @cached_property |
| 147 | def preview(self) -> "Image": |
nothing calls this directly
no test coverage detected