MCPcopy
hub / github.com/docling-project/docling / HTMLDocumentBackend

Class HTMLDocumentBackend

docling/backend/html_backend.py:397–4555  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

395
396
397class HTMLDocumentBackend(DeclarativeDocumentBackend):
398 @override
399 def __init__(
400 self,
401 in_doc: InputDocument,
402 path_or_stream: Union[BytesIO, Path],
403 options: Optional[HTMLBackendOptions] = None,
404 ):
405 if options is None:
406 options = HTMLBackendOptions()
407 super().__init__(in_doc, path_or_stream, options)
408 self.options: HTMLBackendOptions
409 self.soup: Optional[BeautifulSoup] = None
410 self.path_or_stream: Union[BytesIO, Path] = path_or_stream
411 self.base_path: Optional[str] = (
412 str(options.source_uri) if options.source_uri is not None else None
413 )
414 self._image_loader = ImageResourceLoader(
415 enable_local_fetch=options.enable_local_fetch,
416 enable_remote_fetch=options.enable_remote_fetch,
417 max_image_data_base64_bytes=options.max_image_data_base64_bytes,
418 max_remote_image_bytes=options.max_remote_image_bytes,
419 max_redirects=options.max_redirects,
420 headers=options.headers,
421 )
422
423 # Initialize the parents for the hierarchy
424 self.max_levels = 10
425 self.level = 0
426 self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
427 self.ctx = _Context()
428 self._disable_inline_group_depth: int = 0
429 for i in range(self.max_levels):
430 self.parents[i] = None
431 self.hyperlink: Union[AnyUrl, Path, None] = None
432 self.format_tags: list[str] = []
433 self._raw_html_bytes: Optional[bytes] = None
434 self._rendered_html: Optional[str] = None
435 self._rendered_bbox_by_id: dict[str, _RenderedBBox] = {}
436 self._rendered_text_bbox_by_id: dict[str, _RenderedBBox] = {}
437 self._rendered_page_images: list[Image.Image] = []
438 self._rendered_page_size: Optional[Size] = None
439 self._suppressed_tag_ids_stack: list[set[str]] = []
440 self._suppressed_tag_obj_ids_stack: list[set[int]] = []
441 self._form_fields_by_key_id_stack: list[dict[str, _ExtractedFormField]] = []
442 self._tag_name_by_docling_id_cache: dict[str, str] = {}
443 self._generated_html_id_counter: int = 0
444 self._render_visibility_cache: dict[int, bool] = {}
445
446 try:
447 raw = (
448 path_or_stream.getvalue()
449 if isinstance(path_or_stream, BytesIO)
450 else Path(path_or_stream).read_bytes()
451 )
452 self._raw_html_bytes = raw
453 self.soup = BeautifulSoup(raw, "html.parser")
454 except Exception as e:

Callers 15

convert (Method) · 0.90
_convert_html_part (Method) · 0.90
convert (Method) · 0.90
test_heading_levels (Function) · 0.90
test_ordered_lists (Function) · 0.90
test_description_lists (Function) · 0.90
test_unicode_characters (Function) · 0.90
test_html_furniture (Function) · 0.90
test_is_rich_table_cell (Function) · 0.90

Calls

no outgoing calls

Used in the wild: real call sites across dependent graphs

searching dependent graphs…