MCPcopy
hub / github.com/zakirkun/deep-eye / crawl_recursive

Method crawl_recursive

core/scanner_engine.py:207–264  ·  view source on GitHub ↗

Recursively crawl the target website using parallel workers.

(self)

Source from the content-addressed store, hash-verified

205 return []
206
def crawl_recursive(self) -> Set[str]:
    """Recursively crawl the target website using parallel workers.

    Performs a breadth-first traversal that starts at ``self.target_url``.
    Pending URLs are taken from the queue in batches of up to
    ``self.threads`` and handed to ``self.crawl`` on a thread pool capped
    at 10 workers. Only URLs whose depth is below ``self.depth`` are
    crawled; links found on the deepest crawled level are still recorded
    in the result but are not fetched themselves.

    A worker that raises, or whose result is not available within
    ``self.timeout * 3`` seconds, is logged at debug level and skipped.

    Returns:
        The set of all URLs discovered, including the start URL.
    """
    # Function-local import keeps this method self-contained.
    from collections import deque

    self.state_manager.set_phase(PentestPhase.CRAWLING)
    console.print("[bold blue]🕷️ Starting web crawler...[/bold blue]")

    all_urls = {self.target_url}
    # deque pops from the front in O(1); list.pop(0) shifts every element.
    queue = deque([(self.target_url, 0)])

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TimeElapsedColumn(),
        console=console
    ) as progress:

        task = progress.add_task(
            f"[cyan]Crawling (depth: {self.depth})...",
            total=None
        )

        with ThreadPoolExecutor(max_workers=min(self.threads, 10)) as executor:
            while queue:
                # Grab a batch from the queue
                batch = []
                while queue and len(batch) < self.threads:
                    batch.append(queue.popleft())

                # Filter out items beyond depth (only the start URL can
                # fail this check, since deeper links are never enqueued).
                batch = [(url, depth) for url, depth in batch if depth < self.depth]
                if not batch:
                    continue

                # Crawl batch in parallel; remember each job's depth.
                futures = {
                    executor.submit(self.crawl, url, depth): depth
                    for url, depth in batch
                }

                # Results are merged on this thread only, so the shared
                # set and queue need no lock.
                for future, depth in futures.items():
                    try:
                        new_links = future.result(timeout=self.timeout * 3)
                        for link in new_links:
                            if link in all_urls:
                                continue
                            all_urls.add(link)
                            # Enqueue only links that can still be crawled;
                            # deeper ones would just be popped and dropped.
                            if depth + 1 < self.depth:
                                queue.append((link, depth + 1))
                            self.state_manager.update_urls(discovered=1)
                    except Exception as e:
                        # Best effort: one failed page must not abort the crawl.
                        logger.debug(f"Crawl worker error: {e}")

                progress.update(
                    task,
                    description=f"[cyan]Crawling... Found {len(all_urls)} URLs"
                )

    console.print(f"[green]✓[/green] Crawling complete. Found {len(all_urls)} URLs\n")
    return all_urls

Callers 1

scanMethod · 0.95

Calls 2

set_phaseMethod · 0.80
update_urlsMethod · 0.80

Tested by

no test coverage detected