MCPcopy
hub / github.com/unclecode/crawl4ai / CustomHTML2Text

Class CustomHTML2Text

crawl4ai/utils.py:181–223  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

179 return s
180
class CustomHTML2Text(HTML2Text):
    """HTML2Text subclass that fences ``<pre>`` blocks with triple backticks.

    The constructor forwards all arguments to ``HTML2Text`` and then applies
    a fixed set of option values on the instance.  ``handle_tag`` writes a
    Markdown code fence around ``<pre>`` elements before handing every tag
    (including ``<pre>`` itself) on to the base-class handler.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base converter, then apply this class's settings.

        Args:
            *args: Positional arguments forwarded unchanged to ``HTML2Text``.
            **kwargs: Keyword arguments forwarded unchanged to ``HTML2Text``.
        """
        super().__init__(*args, **kwargs)

        # Ordered (attribute, value) pairs.  They are assigned after the base
        # constructor has run, so they replace whatever it set for the same
        # attribute names.
        settings = (
            # Tag-tracking state.  ``inside_pre`` is toggled by handle_tag;
            # ``inside_code`` is only initialised here and is not updated
            # anywhere else in this class.
            ("inside_pre", False),
            ("inside_code", False),
            # Conversion options.
            ("skip_internal_links", False),
            ("single_line_break", False),
            ("mark_code", False),
            ("include_sup_sub", False),
            ("body_width", 0),
            ("ignore_mailto_links", True),
            ("ignore_links", False),
            ("escape_backslash", False),
            ("escape_dot", False),
            ("escape_plus", False),
            ("escape_dash", False),
            ("escape_snob", False),
        )
        for attribute, value in settings:
            setattr(self, attribute, value)

    def handle_tag(self, tag, attrs, start):
        """Emit a code fence for ``<pre>`` tags, then defer to the base class.

        Args:
            tag: Name of the HTML tag being processed.
            attrs: Tag attributes; passed through to the base class untouched.
            start: Truthy when the tag is being opened, falsy when it is
                being closed (this is how ``inside_pre`` is toggled below).
        """
        if tag == "pre":
            # Opening fence is followed by a newline; closing fence is
            # preceded by one.  ``self.o`` is not defined in this class
            # (presumably the output writer inherited from HTML2Text).
            fence = "```\n" if start else "\n```"
            self.o(fence)
            self.inside_pre = bool(start)

        # No other tag gets special treatment here: headings, ``code`` and
        # everything else go straight to the base-class handler, which is
        # also invoked for ``pre`` after the fence has been written.
        super().handle_tag(tag, attrs, start)
224
225def replace_inline_tags(soup, tags, only_text=False):
226 tag_replacements = {

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…