MCPcopy
hub / github.com/Tencent/WeKnora / FetchURLContent

Function FetchURLContent

internal/infrastructure/web_fetch/fetcher.go:28–130  ·  view source on GitHub ↗

FetchURLContent fetches a URL and returns its text content (HTML converted to clean text). Includes SSRF validation, DNS pinning, browser-like headers, and content size limits.

(ctx context.Context, rawURL string)

Source from the content-addressed store, hash-verified

26// FetchURLContent fetches a URL and returns its text content (HTML converted to clean text).
27// Includes SSRF validation, DNS pinning, browser-like headers, and content size limits.
28func FetchURLContent(ctx context.Context, rawURL string) (string, error) {
29 if rawURL == "" {
30 return "", fmt.Errorf("url is empty")
31 }
32
33 // SSRF validation
34 if err := utils.ValidateURLForSSRF(rawURL); err != nil {
35 return "", fmt.Errorf("URL rejected: %w", err)
36 }
37
38 u, err := url.Parse(rawURL)
39 if err != nil {
40 return "", fmt.Errorf("invalid URL: %w", err)
41 }
42 hostname := u.Hostname()
43 port := u.Port()
44 if port == "" {
45 if u.Scheme == "https" {
46 port = "443"
47 } else {
48 port = "80"
49 }
50 }
51
52 // DNS pinning: resolve once, use pinned IP
53 ips, err := net.DefaultResolver.LookupIP(context.Background(), "ip", hostname)
54 if err != nil || len(ips) == 0 {
55 return "", fmt.Errorf("DNS lookup failed for %s: %w", hostname, err)
56 }
57 var pinnedIP net.IP
58 for _, ip := range ips {
59 if utils.IsPublicIP(ip) {
60 pinnedIP = ip
61 break
62 }
63 }
64 if pinnedIP == nil {
65 return "", fmt.Errorf("no public IP for host %s", hostname)
66 }
67
68 // Build request with pinned IP
69 hostPort := net.JoinHostPort(pinnedIP.String(), port)
70 fetchURL := *u
71 fetchURL.Host = hostPort
72
73 ctx, cancel := context.WithTimeout(ctx, fetchTimeout)
74 defer cancel()
75
76 req, err := http.NewRequestWithContext(ctx, http.MethodGet, fetchURL.String(), nil)
77 if err != nil {
78 return "", err
79 }
80 req.Host = hostname
81
82 // Browser-like headers to reduce 403 rejections.
83 // These match a real Chrome browser fingerprint.
84 req.Header.Set("User-Agent",
85 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")

Callers 1

OnEvent Method · 0.92

Calls 8

ValidateURLForSSRF Function · 0.92
IsPublicIP Function · 0.92
Infof Function · 0.92
htmlToText Function · 0.85
Parse Method · 0.80
Set Method · 0.65
String Method · 0.45
Close Method · 0.45

Tested by

no test coverage detected