| 479 | |
| 480 | @override |
| 481 | def convert(self) -> DoclingDocument: |
| 482 | _log.debug("Starting HTML conversion...") |
| 483 | if not self.is_valid(): |
| 484 | raise RuntimeError("Invalid HTML document.") |
| 485 | |
| 486 | origin = DocumentOrigin( |
| 487 | filename=self.file.name or "file", |
| 488 | mimetype="text/html", |
| 489 | binary_hash=self.document_hash, |
| 490 | ) |
| 491 | doc = DoclingDocument(name=self.file.stem or "file", origin=origin) |
| 492 | |
| 493 | if cast(HTMLBackendOptions, self.options).render_page: |
| 494 | self._render_with_browser() |
| 495 | if self._rendered_html: |
| 496 | self.soup = BeautifulSoup(self._rendered_html, "html.parser") |
| 497 | |
| 498 | if self._rendered_page_images and self._rendered_page_size: |
| 499 | render_dpi = cast(HTMLBackendOptions, self.options).render_dpi |
| 500 | for page_no, page_image in enumerate(self._rendered_page_images, start=1): |
| 501 | doc.add_page( |
| 502 | page_no=page_no, |
| 503 | size=self._rendered_page_size, |
| 504 | image=ImageRef.from_pil(image=page_image, dpi=render_dpi), |
| 505 | ) |
| 506 | |
| 507 | assert self.soup is not None |
| 508 | # set the title as furniture, since it is part of the document metadata |
| 509 | title = self.soup.title |
| 510 | if title and self.options.add_title: |
| 511 | title_text = title.get_text(separator=" ", strip=True) |
| 512 | title_clean = HTMLDocumentBackend._clean_unicode(title_text) |
| 513 | doc.add_title( |
| 514 | text=title_clean, |
| 515 | orig=title_text, |
| 516 | content_layer=ContentLayer.FURNITURE, |
| 517 | ) |
| 518 | # remove script, noscript and style tags |
| 519 | for tag in self.soup(["script", "noscript", "style"]): |
| 520 | tag.decompose() |
| 521 | # remove any hidden tag |
| 522 | for tag in self.soup(hidden=True): |
| 523 | tag.decompose() |
| 524 | # fix flow content that is not permitted inside <p> |
| 525 | HTMLDocumentBackend._fix_invalid_paragraph_structure(self.soup) |
| 526 | |
| 527 | content = self.soup.body or self.soup |
| 528 | |
| 529 | # normalize <br> tags - use sentinel to distinguish from source newlines |
| 530 | for text_node in content.find_all(string=True): |
| 531 | if _BR_SENTINEL in text_node: |
| 532 | text_node.replace_with(text_node.replace(_BR_SENTINEL, "")) |
| 533 | for br in content("br"): |
| 534 | br.replace_with(NavigableString(_BR_SENTINEL)) |
| 535 | |
| 536 | # Furniture before the first heading rule, except for headers in tables |
| 537 | header = None |
| 538 | # Find all headers first |