(ctx context.Context, client *http.Client, urlStr, format string, headers map[string]string, robotsCache map[string]*robotstxt.RobotsData)
| 179 | } |
| 180 | |
| 181 | func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr, format string, headers map[string]string, robotsCache map[string]*robotstxt.RobotsData) Result { |
| 182 | result := Result{URL: urlStr} |
| 183 | |
| 184 | // Validate URL |
| 185 | parsedURL, err := url.Parse(urlStr) |
| 186 | if err != nil { |
| 187 | result.Error = fmt.Sprintf("invalid URL: %v", err) |
| 188 | return result |
| 189 | } |
| 190 | |
| 191 | // Check for valid URL structure |
| 192 | if parsedURL.Scheme == "" || parsedURL.Host == "" { |
| 193 | result.Error = "invalid URL: missing scheme or host" |
| 194 | return result |
| 195 | } |
| 196 | |
| 197 | // Only allow HTTP and HTTPS |
| 198 | if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" { |
| 199 | result.Error = "only HTTP and HTTPS URLs are supported" |
| 200 | return result |
| 201 | } |
| 202 | |
| 203 | // Enforce domain allow/deny lists configured on the toolset. |
| 204 | if err := h.checkDomainAllowed(parsedURL); err != nil { |
| 205 | result.Error = err.Error() |
| 206 | return result |
| 207 | } |
| 208 | |
| 209 | // Check robots.txt (with caching per host) |
| 210 | host := parsedURL.Host |
| 211 | robots, cached := robotsCache[host] |
| 212 | if !cached { |
| 213 | var err error |
| 214 | robots, err = h.fetchRobots(ctx, client, parsedURL, headers) |
| 215 | if err != nil { |
| 216 | result.Error = fmt.Sprintf("robots.txt check failed: %v", err) |
| 217 | return result |
| 218 | } |
| 219 | robotsCache[host] = robots |
| 220 | } |
| 221 | |
| 222 | if robots != nil && !robots.TestAgent(parsedURL.Path, useragent.Header) { |
| 223 | result.Error = "URL blocked by robots.txt" |
| 224 | return result |
| 225 | } |
| 226 | |
| 227 | fmtHandler := formatHandlerFor(format) |
| 228 | |
| 229 | req, err := http.NewRequestWithContext(ctx, http.MethodGet, urlStr, http.NoBody) |
| 230 | if err != nil { |
| 231 | result.Error = fmt.Sprintf("failed to create request: %v", err) |
| 232 | return result |
| 233 | } |
| 234 | req.Header.Set("Accept", fmtHandler.accept) |
| 235 | useragent.SetIdentity(req) |
| 236 | // Apply caller-configured headers last so an operator-supplied |
| 237 | // Authorization, User-Agent, Accept, ... wins over the defaults set above. |
| 238 | for k, v := range headers { |
no test coverage detected