MCPcopy
hub / github.com/Tencent/WeKnora / htmlToText

Function htmlToText

internal/infrastructure/web_fetch/fetcher.go:133–156  ·  view source on GitHub ↗

htmlToText extracts clean text from HTML, removing scripts/styles/nav.

(html string)

Source from the content-addressed store, hash-verified

131
132// htmlToText extracts clean text from HTML, removing scripts/styles/nav.
133func htmlToText(html string) string {
134 doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
135 if err != nil {
136 return stripTags(html)
137 }
138 doc.Find("script, style, nav, footer, header, iframe, noscript, svg, img").Remove()
139
140 var sb strings.Builder
141 doc.Find("body").Each(func(i int, s *goquery.Selection) {
142 sb.WriteString(s.Text())
143 })
144 text := sb.String()
145
146 // Normalize whitespace: collapse blank lines
147 lines := strings.Split(text, "\n")
148 var cleaned []string
149 for _, line := range lines {
150 line = strings.TrimSpace(line)
151 if line != "" {
152 cleaned = append(cleaned, line)
153 }
154 }
155 return strings.Join(cleaned, "\n")
156}
157
158func stripTags(s string) string {
159 var sb strings.Builder

Callers 1

FetchURLContentFunction · 0.85

Calls 3

stripTagsFunction · 0.85
RemoveMethod · 0.65
StringMethod · 0.45

Tested by

no test coverage detected