Function htmlToText

internal/infrastructure/web_fetch/fetcher.go:133–156 · view source on GitHub ↗

htmlToText extracts clean text from HTML, removing scripts/styles/nav.

(html string)

Source from the content-addressed store, hash-verified

131
132	// htmlToText extracts clean text from HTML, removing scripts/styles/nav.
133	func htmlToText(html string) string {
134	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
135	if err != nil {
136	return stripTags(html)
137	}
138	doc.Find("script, style, nav, footer, header, iframe, noscript, svg, img").Remove()
139
140	var sb strings.Builder
141	doc.Find("body").Each(func(i int, s *goquery.Selection) {
142	sb.WriteString(s.Text())
143	})
144	text := sb.String()
145
146	// Normalize whitespace: collapse blank lines
147	lines := strings.Split(text, "\n")
148	var cleaned []string
149	for _, line := range lines {
150	line = strings.TrimSpace(line)
151	if line != "" {
152	cleaned = append(cleaned, line)
153	}
154	}
155	return strings.Join(cleaned, "\n")
156	}
157
158	func stripTags(s string) string {
159	var sb strings.Builder

FetchURLContentFunction · 0.85

stripTagsFunction · 0.85

RemoveMethod · 0.65

StringMethod · 0.45

no test coverage detected