fetchRobots fetches and parses robots.txt for the given URL's host. Returns nil (allow all) if robots.txt is missing (404) or unreachable, or if the request cannot be built. Returns an error if the server responds with any status other than 200 or 404, or if the content cannot be read or parsed. We deliberately reuse the caller-supplied client (rather than building a
(ctx context.Context, client *http.Client, targetURL *url.URL, headers map[string]string)
| 282 | // when crossing host boundaries, so credentials never leak to a |
| 283 | // third-party host that handles a robots.txt redirect. |
| 284 | func (h *fetchHandler) fetchRobots(ctx context.Context, client *http.Client, targetURL *url.URL, headers map[string]string) (*robotstxt.RobotsData, error) { |
| 285 | // Build robots.txt URL |
| 286 | robotsURL := &url.URL{ |
| 287 | Scheme: targetURL.Scheme, |
| 288 | Host: targetURL.Host, |
| 289 | Path: "/robots.txt", |
| 290 | } |
| 291 | |
| 292 | // Create request for robots.txt |
| 293 | req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL.String(), http.NoBody) |
| 294 | if err != nil { |
| 295 | // If we can't create request, allow the fetch |
| 296 | return nil, nil |
| 297 | } |
| 298 | |
| 299 | useragent.SetIdentity(req) |
| 300 | // Apply custom headers to robots.txt requests too, so authenticated |
| 301 | // endpoints that also protect robots.txt work correctly. Cross-host |
| 302 | // leaks are prevented by the shared client's CheckRedirect. |
| 303 | for k, v := range headers { |
| 304 | req.Header.Set(k, v) |
| 305 | } |
| 306 | |
| 307 | resp, err := client.Do(req) |
| 308 | if err != nil { |
| 309 | // If robots.txt is unreachable, allow the fetch. This also covers the |
| 310 | // case where CheckRedirect blocks a robots.txt redirect into a denied |
| 311 | // host: we treat it as "no robots.txt available" and proceed with the |
| 312 | // main fetch, which itself runs through the same allow/deny checks. |
| 313 | return nil, nil |
| 314 | } |
| 315 | defer resp.Body.Close() |
| 316 | |
| 317 | // If robots.txt doesn't exist (404), allow the fetch |
| 318 | if resp.StatusCode == http.StatusNotFound { |
| 319 | return nil, nil |
| 320 | } |
| 321 | |
| 322 | // For other non-200 status codes, fail the fetch |
| 323 | if resp.StatusCode != http.StatusOK { |
| 324 | return nil, fmt.Errorf("unexpected status %d", resp.StatusCode) |
| 325 | } |
| 326 | |
| 327 | // Read robots.txt content (limit to 64KB) |
| 328 | robotsBody, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024)) |
| 329 | if err != nil { |
| 330 | return nil, fmt.Errorf("failed to read robots.txt: %w", err) |
| 331 | } |
| 332 | |
| 333 | // Parse robots.txt |
| 334 | robots, err := robotstxt.FromBytes(robotsBody) |
| 335 | if err != nil { |
| 336 | return nil, fmt.Errorf("failed to parse robots.txt: %w", err) |
| 337 | } |
| 338 | |
| 339 | return robots, nil |
| 340 | } |
| 341 |