Do executes the main crawling loop for the given crawl session. It processes items from the queue concurrently (respecting the Concurrency limit), validates each request (URL format, path filters, scope), applies rate limiting and delays, executes the request using the provided doRequest function, w
(crawlSession *CrawlSession, doRequest DoRequestFunc)
| 399 | // The method returns when the queue is empty or the session context is cancelled |
| 400 | // (due to timeout or manual cancellation). Returns an error if the context is cancelled. |
| 401 | func (s *Shared) Do(crawlSession *CrawlSession, doRequest DoRequestFunc) error { |
| 402 | wg := sizedwaitgroup.New(s.Options.Options.Concurrency) |
| 403 | for item := range crawlSession.Queue.Pop() { |
| 404 | if ctxErr := crawlSession.Ctx.Err(); ctxErr != nil { |
| 405 | return ctxErr |
| 406 | } |
| 407 | |
| 408 | req, ok := item.(*navigation.Request) |
| 409 | if !ok { |
| 410 | continue |
| 411 | } |
| 412 | |
| 413 | if !utils.IsURL(req.URL) { |
| 414 | if s.Options.Options.OnSkipURL != nil { |
| 415 | s.Options.Options.OnSkipURL(req.URL) |
| 416 | } |
| 417 | gologger.Debug().Msgf("`%v` not a url. skipping", req.URL) |
| 418 | continue |
| 419 | } |
| 420 | |
| 421 | if !s.Options.ValidatePath(req.URL) { |
| 422 | gologger.Debug().Msgf("`%v` filtered path. skipping", req.URL) |
| 423 | continue |
| 424 | } |
| 425 | |
| 426 | inScope, scopeErr := s.Options.ValidateScope(req.URL, crawlSession.Hostname) |
| 427 | if scopeErr != nil { |
| 428 | gologger.Debug().Msgf("Error validating scope for `%v`: %v. skipping", req.URL, scopeErr) |
| 429 | continue |
| 430 | } |
| 431 | if !req.SkipValidation && !inScope { |
| 432 | gologger.Debug().Msgf("`%v` not in scope. skipping", req.URL) |
| 433 | continue |
| 434 | } |
| 435 | |
| 436 | wg.Add() |
| 437 | // gologger.Debug().Msgf("Visiting: %v", req.URL) // not sure if this is needed |
| 438 | go func() { |
| 439 | defer wg.Done() |
| 440 | |
| 441 | if s.Options.HostRateLimit != nil { |
| 442 | _ = s.Options.HostRateLimit.Take(crawlSession.Hostname) |
| 443 | } else if s.Options.RateLimit != nil { |
| 444 | s.Options.RateLimit.Take() |
| 445 | } |
| 446 | s.ApplyBackoff(crawlSession.Hostname) |
| 447 | |
| 448 | // Delay if the user has asked for it |
| 449 | if s.Options.Options.Delay > 0 { |
| 450 | time.Sleep(time.Duration(s.Options.Options.Delay) * time.Second) |
| 451 | } |
| 452 | |
| 453 | if s.Options.Options.MaxDomainPages > 0 { |
| 454 | counter := s.DomainCounter(crawlSession.Hostname) |
| 455 | if counter.Add(1) > int64(s.Options.Options.MaxDomainPages) { |
| 456 | return |
| 457 | } |
| 458 | } |
no test coverage detected