MCPcopy Index your code
hub / github.com/unclecode/crawl4ai / HTML2Text

Class HTML2Text

crawl4ai/html2text/__init__.py:37–1007  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

35
36
37class HTML2Text(html.parser.HTMLParser):
38 def __init__(
39 self,
40 out: Optional[OutCallback] = None,
41 baseurl: str = "",
42 bodywidth: int = config.BODY_WIDTH,
43 ) -> None:
44 """
45 Input parameters:
46 out: possible custom replacement for self.outtextf (which
47 appends lines of text).
48 baseurl: base URL of the document we process
49 """
50 super().__init__(convert_charrefs=False)
51
52 # Config options
53 self.split_next_td = False
54 self.td_count = 0
55 self.table_start = False
56 self.unicode_snob = config.UNICODE_SNOB # covered in cli
57
58 self.escape_snob = config.ESCAPE_SNOB # covered in cli
59 self.escape_backslash = config.ESCAPE_BACKSLASH # covered in cli
60 self.escape_dot = config.ESCAPE_DOT # covered in cli
61 self.escape_plus = config.ESCAPE_PLUS # covered in cli
62 self.escape_dash = config.ESCAPE_DASH # covered in cli
63
64 self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
65 self.body_width = bodywidth # covered in cli
66 self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli
67 self.inline_links = config.INLINE_LINKS # covered in cli
68 self.protect_links = config.PROTECT_LINKS # covered in cli
69 self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli
70 self.ignore_links = config.IGNORE_ANCHORS # covered in cli
71 self.ignore_mailto_links = config.IGNORE_MAILTO_LINKS # covered in cli
72 self.ignore_images = config.IGNORE_IMAGES # covered in cli
73 self.images_as_html = config.IMAGES_AS_HTML # covered in cli
74 self.images_to_alt = config.IMAGES_TO_ALT # covered in cli
75 self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli
76 self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli
77 self.bypass_tables = config.BYPASS_TABLES # covered in cli
78 self.ignore_tables = config.IGNORE_TABLES # covered in cli
79 self.google_doc = False # covered in cli
80 self.ul_item_mark = "*" # covered in cli
81 self.emphasis_mark = "_" # covered in cli
82 self.strong_mark = "**"
83 self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli
84 self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli
85 self.hide_strikethrough = False # covered in cli
86 self.mark_code = config.MARK_CODE
87 self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli
88 self.wrap_links = config.WRAP_LINKS # covered in cli
89 self.wrap_tables = config.WRAP_TABLES
90 self.pad_tables = config.PAD_TABLES # covered in cli
91 self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
92 self.tag_callback = None
93 self.open_quote = config.OPEN_QUOTE # covered in cli
94 self.close_quote = config.CLOSE_QUOTE # covered in cli

Callers 2

main (Function) · 0.85
html2text (Function) · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild: real call sites across dependent graphs

searching dependent graphs…