| 236 | |
@validate_arguments
def load_document(fpath: str, ocr_reader: Optional[Union[str, OCRReader]] = None, use_embedded_text=True):
    """Load a document from a local path or an HTTP(S) URL.

    The document type is guessed from the file name (ignoring any query
    string). For URLs, the response's ``Content-Type`` header, when present,
    overrides that guess.

    Args:
        fpath: Local file path, or a URL starting with ``http://`` or
            ``https://``.
        ocr_reader: An ``OCRReader`` instance, or a string / ``None`` that is
            passed to ``get_ocr_reader`` to obtain one.
        use_embedded_text: Forwarded unchanged to ``PDFDocument``.

    Returns:
        A ``PDFDocument`` for ``application/pdf``, a ``WebDocument`` for
        ``text/html``, otherwise an ``ImageDocument`` built from the payload.

    Raises:
        UnsupportedDocument: If the download fails or the payload cannot be
            identified as an image.
        NoOCRReaderFound: If ``ocr_reader`` is neither falsy, a string, nor an
            ``OCRReader`` instance.
    """
    # Function-scope import so this block does not depend on the module's
    # top-level import list.
    from io import BytesIO

    base_path = os.path.basename(fpath).split("?")[0].strip()
    doc_type = mimetypes.guess_type(base_path)[0]

    if fpath.startswith(("http://", "https://")):
        # NOTE(review): no timeout is passed, so a stalled server blocks
        # indefinitely -- consider adding one.
        resp = requests.get(fpath, allow_redirects=True, stream=True)
        source = resp
        stream = resp.raw
    else:
        resp = None
        stream = source = open(fpath, "rb")

    # From here on the file handle / HTTP connection is released on every
    # path, including the error paths.
    try:
        if resp is not None:
            if not resp.ok:
                raise UnsupportedDocument(f"Failed to download: {resp.content}")

            if "Content-Type" in resp.headers:
                doc_type = resp.headers["Content-Type"].split(";")[0].strip()

            # The raw stream does not undo gzip/deflate transfer encodings by
            # default; without this the PDF/image parser may receive
            # compressed bytes.
            stream.decode_content = True

        if not ocr_reader or isinstance(ocr_reader, str):
            ocr_reader = get_ocr_reader(ocr_reader)
        elif not isinstance(ocr_reader, OCRReader):
            raise NoOCRReaderFound(f"{ocr_reader} is not a supported OCRReader class")

        if doc_type == "application/pdf":
            return PDFDocument(stream.read(), ocr_reader=ocr_reader, use_embedded_text=use_embedded_text)

        if doc_type == "text/html":
            return WebDocument(fpath)

        try:
            # Image.open is lazy, so hand it an in-memory buffer that stays
            # valid after the underlying file/connection is closed below.
            img = Image.open(BytesIO(stream.read()))
        except UnidentifiedImageError as e:
            raise UnsupportedDocument(e) from e
        return ImageDocument(img, ocr_reader=ocr_reader)
    finally:
        source.close()