ExecuteCrawling executes the crawling main loop
()
| 11 | |
| 12 | // ExecuteCrawling executes the crawling main loop |
| 13 | func (r *Runner) ExecuteCrawling() error { |
| 14 | if r.crawler == nil { |
| 15 | return errkit.New("crawler is not initialized") |
| 16 | } |
| 17 | inputs := r.parseInputs() |
| 18 | if len(inputs) == 0 { |
| 19 | return errkit.New("no input provided for crawling") |
| 20 | } |
| 21 | |
| 22 | for _, input := range inputs { |
| 23 | _ = r.state.InFlightUrls.Set(addSchemeIfNotExists(input), struct{}{}) |
| 24 | } |
| 25 | |
| 26 | defer func() { |
| 27 | if err := r.crawler.Close(); err != nil { |
| 28 | gologger.Error().Msgf("Error closing crawler: %v\n", err) |
| 29 | } |
| 30 | }() |
| 31 | |
| 32 | wg := sizedwaitgroup.New(r.options.Parallelism) |
| 33 | for _, input := range inputs { |
| 34 | if !r.networkpolicy.Validate(input) { |
| 35 | gologger.Info().Msgf("Skipping excluded host %s", input) |
| 36 | continue |
| 37 | } |
| 38 | wg.Add() |
| 39 | input = addSchemeIfNotExists(input) |
| 40 | go func(input string) { |
| 41 | defer wg.Done() |
| 42 | |
| 43 | if err := r.crawler.Crawl(input); err != nil { |
| 44 | gologger.Warning().Msgf("Could not crawl %s: %s", input, err) |
| 45 | } |
| 46 | r.state.InFlightUrls.Delete(input) |
| 47 | }(input) |
| 48 | } |
| 49 | wg.Wait() |
| 50 | return nil |
| 51 | } |
| 52 | |
| 53 | // URLs without a scheme are skipped, and a scheme is required for headless mode and other purposes. |
| 54 | // This method adds a scheme if the given input does not have one. |
no test coverage detected