MCPcopy Index your code
hub / github.com/impira/docquery / PDFDocument

Class PDFDocument

src/docquery/document.py:103–164  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

101
102
103class PDFDocument(Document):
def __init__(self, b, ocr_reader, use_embedded_text, **kwargs):
    """Remember the PDF payload and text-extraction settings, then set up the base class.

    Args:
        b: The PDF content. It is later passed to
            ``pdf2image.convert_from_bytes``, so presumably raw bytes.
        ocr_reader: Object exposing ``apply_ocr(image)`` that returns a
            ``(words, boxes)`` pair; used for pages with no usable
            embedded text.
        use_embedded_text: When truthy, words embedded in the PDF are
            extracted first and OCR is only used for pages that yield none.
        **kwargs: Forwarded unchanged to the parent ``Document`` initialiser.
    """
    # Attributes are set before delegating to the parent, as the parent's
    # initialiser is not visible here and may rely on them.
    self.b, self.ocr_reader, self.use_embedded_text = b, ocr_reader, use_embedded_text
    super().__init__(**kwargs)
110
@cached_property
def context(self) -> Dict[str, List[Tuple["Image.Image", List[Any]]]]:
    """Gather words, boxes and dimensions for every page and build the output.

    For each page the embedded words are used when ``use_embedded_text`` is
    set and the page yields at least one word; otherwise the page image is
    passed to ``ocr_reader.apply_ocr``.

    Returns:
        An empty dict when ``_pdf`` is ``None``; otherwise whatever
        ``_generate_document_output`` returns for the collected data.

    Raises:
        ValueError: If the number of rendered images differs from the
            number of pages reported for the PDF.
    """
    document = self._pdf
    if document is None:
        return {}

    page_images = self._images
    page_count = len(document.pages)
    if len(page_images) != page_count:
        raise ValueError(
            f"Mismatch: pdfplumber() thinks there are {page_count} pages and"
            f" pdf2image thinks there are {len(page_images)}"
        )

    words_by_page, boxes_by_page, dimensions_by_page = [], [], []
    # The counts were verified equal above, so pages and images pair up 1:1.
    for page, image in zip(document.pages, page_images):
        embedded = page.extract_words() if self.use_embedded_text else []

        if len(embedded) > 0:
            # Embedded text available: dimensions come from the PDF page.
            words = [entry["text"] for entry in embedded]
            boxes = [[entry["x0"], entry["top"], entry["x1"], entry["bottom"]] for entry in embedded]
            dimensions = (page.width, page.height)
        else:
            # No embedded words (or they are disabled): OCR the rendered
            # image, so dimensions come from that image instead.
            words, boxes = self.ocr_reader.apply_ocr(image)
            dimensions = (image.width, image.height)

        words_by_page.append(words)
        boxes_by_page.append(boxes)
        dimensions_by_page.append(dimensions)

    return self._generate_document_output(page_images, words_by_page, boxes_by_page, dimensions_by_page)
145
@cached_property
def preview(self) -> List["Image.Image"]:
    """Return the rendered page images used as the document preview.

    The value is the list produced by ``_images`` (each item is the result
    of ``.convert("RGB")``), not a single image, so the return annotation
    is a list.
    """
    return self._images
149
@cached_property
def _images(self):
    """Convert the PDF payload to images and return them in RGB mode.

    Returns:
        A list with the ``.convert("RGB")`` result of every image yielded
        by ``pdf2image.convert_from_bytes(self.b)``.
    """
    # TODO: This library requires poppler, which is not present everywhere.
    # We should look into alternatives. We could also gracefully handle this
    # and simply fall back to _only_ extracted text
    rendered_pages = pdf2image.convert_from_bytes(self.b)
    return [page_image.convert("RGB") for page_image in rendered_pages]
157
158 @cached_property
159 def _pdf(self):
160 use_pdf_plumber()

Callers 1

load_document · Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected