Method handle_data

crawl4ai/html2text/__init__.py:867–908 · view source on GitHub ↗

(self, data: str, entity_char: bool = False)

Source from the content-addressed store, hash-verified

865	self.outcount += 1
866
def handle_data(self, data: str, entity_char: bool = False) -> None:
    """Process one run of character data and send it to the output.

    Steps, in order:

    1. Empty ``data`` is ignored.
    2. If ``self.stressed`` is set, ``data`` is stripped, the flag is
       cleared and ``self.preceding_stressed`` is raised.  Otherwise, if
       ``self.preceding_stressed`` is set, a single space is prepended
       when the first character is not a bracket, whitespace or one of
       ``.!?``, ``hn(self.current_tag)`` is falsy and the current tag is
       not ``a``, ``code`` or ``pre``; the flag is then cleared.
    3. If ``self.style`` is set, ``data`` is run through
       ``dumb_css_parser`` and merged into ``self.style_def`` (the data
       still continues through the steps below).
    4. If ``self.maybe_automatic_link`` holds an href, either the
       ``<url>`` form is written and the method returns, or ``[`` is
       written and the pending href is cleared.
    5. Unless ``self.code`` or ``self.pre`` is set or ``entity_char`` is
       true, ``data`` is passed through ``escape_md_section``.
    6. ``data`` is stored in ``self.preceding_data`` and written with
       ``self.o(data, puredata=True)``.

    :param data: The text to handle.
    :param entity_char: When true, the markdown escaping step is skipped.
    """
    if not data:
        # Some HTML entities (LEFT-TO-RIGHT MARK, for example) arrive
        # with no data at all; there is nothing to do for them.
        return

    if self.stressed:
        data = data.strip()
        self.stressed = False
        self.preceding_stressed = True
    elif self.preceding_stressed:
        # The pattern is meant to match a letter or common punctuation.
        wants_leading_space = (
            re.match(r"[^][(){}\s.!?]", data[0])
            and not hn(self.current_tag)
            and self.current_tag not in ("a", "code", "pre")
        )
        if wants_leading_space:
            data = " " + data
        self.preceding_stressed = False

    if self.style:
        parsed_rules = dumb_css_parser(data)
        self.style_def.update(parsed_rules)

    pending_href = self.maybe_automatic_link
    if pending_href is not None:
        is_automatic_link = (
            pending_href == data
            and self.absolute_url_matcher.match(pending_href)
            and self.use_automatic_links
        )
        if is_automatic_link:
            # Link text equals its absolute URL: write the <url> form
            # and stop; the pending href is left untouched here.
            self.o(f"<{data}>")
            self.empty_link = False
            return

        # Otherwise open a regular link and drop the pending href.
        self.o("[")
        self.maybe_automatic_link = None
        self.empty_link = False

    verbatim = self.code or self.pre or entity_char
    if not verbatim:
        data = escape_md_section(
            data,
            snob=self.escape_snob,
            escape_dot=self.escape_dot,
            escape_plus=self.escape_plus,
            escape_dash=self.escape_dash,
        )

    self.preceding_data = data
    self.o(data, puredata=True)
909
910	def charref(self, name: str) -> str:
911	if name[0] in ["x", "X"]:

handle_charref — Method · 0.95

handle_entityref — Method · 0.95

o — Method · 0.95

hn — Function · 0.85

dumb_css_parser — Function · 0.85

escape_md_section — Function · 0.85

no test coverage detected