MCPcopy
hub / github.com/docling-project/docling / convert

Method convert

docling/backend/html_backend.py:481–555  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

479
480 @override
481 def convert(self) -> DoclingDocument:
482 _log.debug("Starting HTML conversion...")
483 if not self.is_valid():
484 raise RuntimeError("Invalid HTML document.")
485
486 origin = DocumentOrigin(
487 filename=self.file.name or "file",
488 mimetype="text/html",
489 binary_hash=self.document_hash,
490 )
491 doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
492
493 if cast(HTMLBackendOptions, self.options).render_page:
494 self._render_with_browser()
495 if self._rendered_html:
496 self.soup = BeautifulSoup(self._rendered_html, "html.parser")
497
498 if self._rendered_page_images and self._rendered_page_size:
499 render_dpi = cast(HTMLBackendOptions, self.options).render_dpi
500 for page_no, page_image in enumerate(self._rendered_page_images, start=1):
501 doc.add_page(
502 page_no=page_no,
503 size=self._rendered_page_size,
504 image=ImageRef.from_pil(image=page_image, dpi=render_dpi),
505 )
506
507 assert self.soup is not None
508 # set the title as furniture, since it is part of the document metadata
509 title = self.soup.title
510 if title and self.options.add_title:
511 title_text = title.get_text(separator=" ", strip=True)
512 title_clean = HTMLDocumentBackend._clean_unicode(title_text)
513 doc.add_title(
514 text=title_clean,
515 orig=title_text,
516 content_layer=ContentLayer.FURNITURE,
517 )
518 # remove script and style tags
519 for tag in self.soup(["script", "noscript", "style"]):
520 tag.decompose()
521 # remove any hidden tag
522 for tag in self.soup(hidden=True):
523 tag.decompose()
524 # fix flow content that is not permitted inside <p>
525 HTMLDocumentBackend._fix_invalid_paragraph_structure(self.soup)
526
527 content = self.soup.body or self.soup
528
529 # normalize <br> tags - use sentinel to distinguish from source newlines
530 for text_node in content.find_all(string=True):
531 if _BR_SENTINEL in text_node:
532 text_node.replace_with(text_node.replace(_BR_SENTINEL, ""))
533 for br in content("br"):
534 br.replace_with(NavigableString(_BR_SENTINEL))
535
536 # Furniture before the first heading rule, except for headers in tables
537 header = None
538 # Find all headers first

Calls 8

is_valid · Method · 0.95
_render_with_browser · Method · 0.95
_walk · Method · 0.95
_Context · Class · 0.85
get_text · Method · 0.80
_clean_unicode · Method · 0.80
replace · Method · 0.80