Function load_document

src/docquery/document.py:238–267 · view source on GitHub ↗

(fpath: str, ocr_reader: Optional[Union[str, OCRReader]] = None, use_embedded_text=True)

Source from the content-addressed store, hash-verified

236
@validate_arguments
def load_document(fpath: str, ocr_reader: Optional[Union[str, OCRReader]] = None, use_embedded_text=True):
    """Load a document from a local path or an HTTP(S) URL.

    The document type is first guessed from the file name (any ``?query``
    suffix is stripped) and, for URLs, overridden by the response's
    ``Content-Type`` header when one is present.

    Args:
        fpath: Local file path, or a URL starting with ``http://`` or
            ``https://``.
        ocr_reader: An ``OCRReader`` instance to use as-is, or a string /
            falsy value that is handed to ``get_ocr_reader`` to resolve one.
        use_embedded_text: Forwarded unchanged to ``PDFDocument``.

    Returns:
        A ``PDFDocument`` for ``application/pdf``, a ``WebDocument`` for
        ``text/html``, and otherwise an ``ImageDocument`` built from the
        bytes opened with Pillow.

    Raises:
        UnsupportedDocument: If the download does not succeed, or the payload
            is not an image Pillow can identify.
        NoOCRReaderFound: If ``ocr_reader`` is neither falsy, a string, nor
            an ``OCRReader`` instance.
    """
    # Function-scope import: needed only to hand Pillow an in-memory stream.
    import io

    base_path = os.path.basename(fpath).split("?")[0].strip()
    doc_type = mimetypes.guess_type(base_path)[0]

    is_remote = fpath.startswith(("http://", "https://"))
    if is_remote:
        # NOTE(review): no timeout is passed, so a stalled server blocks this
        # call indefinitely; consider adding one.
        source = requests.get(fpath, allow_redirects=True, stream=True)
    else:
        source = open(fpath, "rb")

    # Both a file object and a requests Response expose close(); the finally
    # clause guarantees the handle/connection is released on every path.
    try:
        if is_remote:
            if not source.ok:
                raise UnsupportedDocument(f"Failed to download: {source.content}")

            if "Content-Type" in source.headers:
                doc_type = source.headers["Content-Type"].split(";")[0].strip()

        if not ocr_reader or isinstance(ocr_reader, str):
            ocr_reader = get_ocr_reader(ocr_reader)
        elif not isinstance(ocr_reader, OCRReader):
            raise NoOCRReaderFound(f"{ocr_reader} is not a supported OCRReader class")

        if doc_type == "text/html":
            # WebDocument works from the path/URL itself; the payload opened
            # above is not needed and is simply released.
            return WebDocument(fpath)

        # Response.content applies gzip/deflate decoding, which reading
        # Response.raw directly would skip.
        payload = source.content if is_remote else source.read()
    finally:
        source.close()

    if doc_type == "application/pdf":
        return PDFDocument(payload, ocr_reader=ocr_reader, use_embedded_text=use_embedded_text)

    try:
        # Pillow loads pixel data lazily, so it gets an in-memory stream that
        # stays valid after the original source has been closed.
        img = Image.open(io.BytesIO(payload))
    except UnidentifiedImageError as e:
        raise UnsupportedDocument(e) from e
    return ImageDocument(img, ocr_reader=ocr_reader)

test_impira_dataset · Function · 0.90

test_run_with_choosen_OCR_str · Function · 0.90

test_run_with_choosen_OCR_instance · Function · 0.90

test_run_with_ignore_embedded_text · Function · 0.90

test_impira_dataset · Function · 0.90

main · Function · 0.85

UnsupportedDocument · Class · 0.85

get_ocr_reader · Function · 0.85

NoOCRReaderFound · Class · 0.85

PDFDocument · Class · 0.85

WebDocument · Class · 0.85

ImageDocument · Class · 0.85

get · Method · 0.80

test_impira_dataset · Function · 0.72

test_run_with_choosen_OCR_str · Function · 0.72

test_run_with_choosen_OCR_instance · Function · 0.72

test_run_with_ignore_embedded_text · Function · 0.72

test_impira_dataset · Function · 0.72