| 395 | |
| 396 | |
| 397 | class HTMLDocumentBackend(DeclarativeDocumentBackend): |
| 398 | @override |
| 399 | def __init__( |
| 400 | self, |
| 401 | in_doc: InputDocument, |
| 402 | path_or_stream: Union[BytesIO, Path], |
| 403 | options: Optional[HTMLBackendOptions] = None, |
| 404 | ): |
| 405 | if options is None: |
| 406 | options = HTMLBackendOptions() |
| 407 | super().__init__(in_doc, path_or_stream, options) |
| 408 | self.options: HTMLBackendOptions |
| 409 | self.soup: Optional[BeautifulSoup] = None |
| 410 | self.path_or_stream: Union[BytesIO, Path] = path_or_stream |
| 411 | self.base_path: Optional[str] = ( |
| 412 | str(options.source_uri) if options.source_uri is not None else None |
| 413 | ) |
| 414 | self._image_loader = ImageResourceLoader( |
| 415 | enable_local_fetch=options.enable_local_fetch, |
| 416 | enable_remote_fetch=options.enable_remote_fetch, |
| 417 | max_image_data_base64_bytes=options.max_image_data_base64_bytes, |
| 418 | max_remote_image_bytes=options.max_remote_image_bytes, |
| 419 | max_redirects=options.max_redirects, |
| 420 | headers=options.headers, |
| 421 | ) |
| 422 | |
| 423 | # Initialize the parents for the hierarchy |
| 424 | self.max_levels = 10 |
| 425 | self.level = 0 |
| 426 | self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {} |
| 427 | self.ctx = _Context() |
| 428 | self._disable_inline_group_depth: int = 0 |
| 429 | for i in range(self.max_levels): |
| 430 | self.parents[i] = None |
| 431 | self.hyperlink: Union[AnyUrl, Path, None] = None |
| 432 | self.format_tags: list[str] = [] |
| 433 | self._raw_html_bytes: Optional[bytes] = None |
| 434 | self._rendered_html: Optional[str] = None |
| 435 | self._rendered_bbox_by_id: dict[str, _RenderedBBox] = {} |
| 436 | self._rendered_text_bbox_by_id: dict[str, _RenderedBBox] = {} |
| 437 | self._rendered_page_images: list[Image.Image] = [] |
| 438 | self._rendered_page_size: Optional[Size] = None |
| 439 | self._suppressed_tag_ids_stack: list[set[str]] = [] |
| 440 | self._suppressed_tag_obj_ids_stack: list[set[int]] = [] |
| 441 | self._form_fields_by_key_id_stack: list[dict[str, _ExtractedFormField]] = [] |
| 442 | self._tag_name_by_docling_id_cache: dict[str, str] = {} |
| 443 | self._generated_html_id_counter: int = 0 |
| 444 | self._render_visibility_cache: dict[int, bool] = {} |
| 445 | |
| 446 | try: |
| 447 | raw = ( |
| 448 | path_or_stream.getvalue() |
| 449 | if isinstance(path_or_stream, BytesIO) |
| 450 | else Path(path_or_stream).read_bytes() |
| 451 | ) |
| 452 | self._raw_html_bytes = raw |
| 453 | self.soup = BeautifulSoup(raw, "html.parser") |
| 454 | except Exception as e: |
no outgoing calls
searching dependent graphs…