htmlToText extracts clean text from HTML, removing script, style, nav, footer, header, iframe, noscript, svg, and img elements.
(html string)
| 131 | |
| 132 | // htmlToText extracts clean text from HTML, removing scripts/styles/nav. |
| 133 | func htmlToText(html string) string { |
| 134 | doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) |
| 135 | if err != nil { |
| 136 | return stripTags(html) |
| 137 | } |
| 138 | doc.Find("script, style, nav, footer, header, iframe, noscript, svg, img").Remove() |
| 139 | |
| 140 | var sb strings.Builder |
| 141 | doc.Find("body").Each(func(i int, s *goquery.Selection) { |
| 142 | sb.WriteString(s.Text()) |
| 143 | }) |
| 144 | text := sb.String() |
| 145 | |
| 146 | // Normalize whitespace: collapse blank lines |
| 147 | lines := strings.Split(text, "\n") |
| 148 | var cleaned []string |
| 149 | for _, line := range lines { |
| 150 | line = strings.TrimSpace(line) |
| 151 | if line != "" { |
| 152 | cleaned = append(cleaned, line) |
| 153 | } |
| 154 | } |
| 155 | return strings.Join(cleaned, "\n") |
| 156 | } |
| 157 | |
| 158 | func stripTags(s string) string { |
| 159 | var sb strings.Builder |
no test coverage detected