FetchURLContent fetches a URL and returns its text content (HTML converted to clean text). Includes SSRF validation, DNS pinning, browser-like headers, and content size limits.
(ctx context.Context, rawURL string)
| 26 | // FetchURLContent fetches a URL and returns its text content (HTML converted to clean text). |
| 27 | // Includes SSRF validation, DNS pinning, browser-like headers, and content size limits. |
| 28 | func FetchURLContent(ctx context.Context, rawURL string) (string, error) { |
| 29 | if rawURL == "" { |
| 30 | return "", fmt.Errorf("url is empty") |
| 31 | } |
| 32 | |
| 33 | // SSRF validation |
| 34 | if err := utils.ValidateURLForSSRF(rawURL); err != nil { |
| 35 | return "", fmt.Errorf("URL rejected: %w", err) |
| 36 | } |
| 37 | |
| 38 | u, err := url.Parse(rawURL) |
| 39 | if err != nil { |
| 40 | return "", fmt.Errorf("invalid URL: %w", err) |
| 41 | } |
| 42 | hostname := u.Hostname() |
| 43 | port := u.Port() |
| 44 | if port == "" { |
| 45 | if u.Scheme == "https" { |
| 46 | port = "443" |
| 47 | } else { |
| 48 | port = "80" |
| 49 | } |
| 50 | } |
| 51 | |
| 52 | // DNS pinning: resolve once, use pinned IP |
| 53 | ips, err := net.DefaultResolver.LookupIP(context.Background(), "ip", hostname) |
| 54 | if err != nil || len(ips) == 0 { |
| 55 | return "", fmt.Errorf("DNS lookup failed for %s: %w", hostname, err) |
| 56 | } |
| 57 | var pinnedIP net.IP |
| 58 | for _, ip := range ips { |
| 59 | if utils.IsPublicIP(ip) { |
| 60 | pinnedIP = ip |
| 61 | break |
| 62 | } |
| 63 | } |
| 64 | if pinnedIP == nil { |
| 65 | return "", fmt.Errorf("no public IP for host %s", hostname) |
| 66 | } |
| 67 | |
| 68 | // Build request with pinned IP |
| 69 | hostPort := net.JoinHostPort(pinnedIP.String(), port) |
| 70 | fetchURL := *u |
| 71 | fetchURL.Host = hostPort |
| 72 | |
| 73 | ctx, cancel := context.WithTimeout(ctx, fetchTimeout) |
| 74 | defer cancel() |
| 75 | |
| 76 | req, err := http.NewRequestWithContext(ctx, http.MethodGet, fetchURL.String(), nil) |
| 77 | if err != nil { |
| 78 | return "", err |
| 79 | } |
| 80 | req.Host = hostname |
| 81 | |
| 82 | // Browser-like headers to reduce 403 rejections. |
| 83 | // These match a real Chrome browser fingerprint. |
| 84 | req.Header.Set("User-Agent", |
| 85 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36") |
no test coverage detected