| 101 | |
| 102 | |
| 103 | class PDFDocument(Document): |
def __init__(self, b, ocr_reader, use_embedded_text, **kwargs):
    """Store the PDF payload and extraction settings, then initialise the base class.

    Args:
        b: Raw PDF content; it is later handed to
            ``pdf2image.convert_from_bytes`` to render the pages.
        ocr_reader: Object exposing ``apply_ocr(image)`` that returns a
            ``(words, boxes)`` pair for a page image.
        use_embedded_text: When truthy, text embedded in the PDF is tried
            first and OCR is only used for pages that yield no words.
        **kwargs: Forwarded unchanged to the base class initialiser.
    """
    # The attributes are assigned before delegating to the base initialiser,
    # exactly as in the original ordering.
    self.b, self.ocr_reader, self.use_embedded_text = (
        b,
        ocr_reader,
        use_embedded_text,
    )

    super().__init__(**kwargs)
| 110 | |
@cached_property
def context(self) -> Dict[str, List[Tuple["Image.Image", List[Any]]]]:
    """Collect per-page words, boxes and dimensions and build the document output.

    For every page the embedded text is used when ``use_embedded_text`` is
    set and the page yields at least one word; otherwise the rendered page
    image is passed to ``ocr_reader.apply_ocr``.

    Returns:
        An empty dict when no PDF is available (``self._pdf`` is ``None``);
        otherwise whatever ``self._generate_document_output`` returns for
        the page images and the collected per-page lists.

    Raises:
        ValueError: If the number of rendered images differs from the
            number of pages reported by the parsed PDF.
    """
    pdf = self._pdf
    if pdf is None:
        return {}

    pages = pdf.pages
    images = self._images

    if len(images) != len(pages):
        raise ValueError(
            f"Mismatch: pdfplumber() thinks there are {len(pages)} pages and"
            f" pdf2image thinks there are {len(images)}"
        )

    words_by_page = []
    boxes_by_page = []
    dimensions_by_page = []
    # The lengths were just checked to be equal, so zip pairs every page
    # with its own rendered image.
    for page, image in zip(pages, images):
        embedded = page.extract_words() if self.use_embedded_text else []

        if embedded:
            # Embedded text: boxes and dimensions both come from the PDF page.
            words = [entry["text"] for entry in embedded]
            boxes = [
                [entry["x0"], entry["top"], entry["x1"], entry["bottom"]]
                for entry in embedded
            ]
            dimensions = (page.width, page.height)
        else:
            # No usable embedded text: OCR the rendered image, so the
            # dimensions are taken from the image instead.
            words, boxes = self.ocr_reader.apply_ocr(image)
            dimensions = (image.width, image.height)

        words_by_page.append(words)
        boxes_by_page.append(boxes)
        dimensions_by_page.append(dimensions)

    return self._generate_document_output(images, words_by_page, boxes_by_page, dimensions_by_page)
| 145 | |
@cached_property
def preview(self) -> List["Image.Image"]:
    """Return the rendered page images used as the document preview.

    The value is the same list object produced by ``self._images`` (RGB
    renderings of the PDF), not a single image, which is why the return
    annotation is a list.
    """
    return self._images
| 149 | |
@cached_property
def _images(self):
    """Render the PDF bytes in ``self.b`` to images converted to RGB mode.

    Returns:
        A list holding the result of ``.convert("RGB")`` for every image
        that ``pdf2image.convert_from_bytes`` produces.
    """
    # TODO: This library requires poppler, which is not present everywhere.
    # We should look into alternatives. We could also gracefully handle this
    # and simply fall back to _only_ extracted text
    rendered_pages = pdf2image.convert_from_bytes(self.b)
    return [rendered.convert("RGB") for rendered in rendered_pages]
| 157 | |
| 158 | @cached_property |
| 159 | def _pdf(self): |
| 160 | use_pdf_plumber() |