hub / github.com/docling-project/docling / convert

Method convert

docling/backend/html_backend.py:481–555 · view source on GitHub ↗

(self)

Source from the content-addressed store, hash-verified

479
480	@override
481	def convert(self) -> DoclingDocument:
482	_log.debug("Starting HTML conversion...")
483	if not self.is_valid():
484	raise RuntimeError("Invalid HTML document.")
485
486	origin = DocumentOrigin(
487	filename=self.file.name or "file",
488	mimetype="text/html",
489	binary_hash=self.document_hash,
490	)
491	doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
492
493	if cast(HTMLBackendOptions, self.options).render_page:
494	self._render_with_browser()
495	if self._rendered_html:
496	self.soup = BeautifulSoup(self._rendered_html, "html.parser")
497
498	if self._rendered_page_images and self._rendered_page_size:
499	render_dpi = cast(HTMLBackendOptions, self.options).render_dpi
500	for page_no, page_image in enumerate(self._rendered_page_images, start=1):
501	doc.add_page(
502	page_no=page_no,
503	size=self._rendered_page_size,
504	image=ImageRef.from_pil(image=page_image, dpi=render_dpi),
505	)
506
507	assert self.soup is not None
508	# set the title as furniture, since it is part of the document metadata
509	title = self.soup.title
510	if title and self.options.add_title:
511	title_text = title.get_text(separator=" ", strip=True)
512	title_clean = HTMLDocumentBackend._clean_unicode(title_text)
513	doc.add_title(
514	text=title_clean,
515	orig=title_text,
516	content_layer=ContentLayer.FURNITURE,
517	)
518	# remove script and style tags
519	for tag in self.soup(["script", "noscript", "style"]):
520	tag.decompose()
521	# remove any hidden tag
522	for tag in self.soup(hidden=True):
523	tag.decompose()
524	# fix flow content that is not permitted inside <p>
525	HTMLDocumentBackend._fix_invalid_paragraph_structure(self.soup)
526
527	content = self.soup.body or self.soup
528
529	# normalize <br> tags - use sentinel to distinguish from source newlines
530	for text_node in content.find_all(string=True):
531	if _BR_SENTINEL in text_node:
532	text_node.replace_with(text_node.replace(_BR_SENTINEL, ""))
533	for br in content("br"):
534	br.replace_with(NavigableString(_BR_SENTINEL))
535
536	# Furniture before the first heading rule, except for headers in tables
537	header = None
538	# Find all headers first

Callers 13

convertMethod · 0.95

_convert_html_partMethod · 0.95

convertMethod · 0.95

test_heading_levelsFunction · 0.95

test_ordered_listsFunction · 0.95

test_description_listsFunction · 0.95

test_unicode_charactersFunction · 0.95

test_html_furnitureFunction · 0.95

test_table_row_section_flag_from_tr_and_td_classFunction · 0.95

test_e2e_rich_table_oom_regressionFunction · 0.95

test_nested_table_images_no_quadratic_picturesFunction · 0.95

test_anchor_fragment_links_with_source_uriFunction · 0.95

Calls 8

is_validMethod · 0.95

_render_with_browserMethod · 0.95

_walkMethod · 0.95

_ContextClass · 0.85

get_textMethod · 0.80

_clean_unicodeMethod · 0.80

_fix_invalid_paragraph_structureMethod · 0.80

replaceMethod · 0.80

Tested by 9

test_heading_levelsFunction · 0.76

test_ordered_listsFunction · 0.76

test_description_listsFunction · 0.76

test_unicode_charactersFunction · 0.76

test_html_furnitureFunction · 0.76

test_table_row_section_flag_from_tr_and_td_classFunction · 0.76

test_e2e_rich_table_oom_regressionFunction · 0.76

test_nested_table_images_no_quadratic_picturesFunction · 0.76

test_anchor_fragment_links_with_source_uriFunction · 0.76