MCPcopy
hub / github.com/unclecode/crawl4ai / AsyncWebCrawler

Class AsyncWebCrawler

crawl4ai/async_webcrawler.py:21–283  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

19
20
21class AsyncWebCrawler:
def __init__(
    self,
    crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
    always_by_pass_cache: bool = False,
    base_directory: str = str(Path.home()),
    **kwargs,
):
    """Set up the crawler and its on-disk working folder.

    Args:
        crawler_strategy: Strategy object used for crawling. When omitted
            (or falsy), an ``AsyncPlaywrightCrawlerStrategy`` is built
            from ``**kwargs``.
        always_by_pass_cache: Stored on the instance unchanged.
        base_directory: Directory under which the ``.crawl4ai`` folder is
            created. The default (the user's home directory) is evaluated
            once, when the function is defined.
        **kwargs: Forwarded to ``AsyncPlaywrightCrawlerStrategy`` when no
            strategy is supplied; the ``verbose`` entry is also read here
            (defaulting to ``False``).
    """
    self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
        **kwargs
    )
    self.always_by_pass_cache = always_by_pass_cache
    self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
    # makedirs creates missing parent directories, so this single call
    # also creates the ``.crawl4ai`` folder itself.
    os.makedirs(os.path.join(self.crawl4ai_folder, "cache"), exist_ok=True)
    self.ready = False
    self.verbose = kwargs.get("verbose", False)
39
async def __aenter__(self):
    """Enter the async context: start the strategy, warm up, return self.

    Returns:
        This crawler instance, for use as the ``async with`` target.

    Raises:
        Whatever ``awarmup()`` raises. In that case the crawler strategy
        is exited first so that it is not left open.
    """
    await self.crawler_strategy.__aenter__()
    try:
        await self.awarmup()
    except BaseException as exc:
        # ``async with`` does not call __aexit__ when __aenter__ raises,
        # so the already-entered strategy has to be released here.
        await self.crawler_strategy.__aexit__(type(exc), exc, exc.__traceback__)
        raise
    return self
44
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave the async context by delegating to the crawler strategy.

    The strategy's return value is discarded, so this method always
    returns ``None`` and never suppresses an in-flight exception.
    """
    strategy = self.crawler_strategy
    await strategy.__aexit__(exc_type, exc_val, exc_tb)
47
async def awarmup(self):
    """Initialise the database, run one throwaway crawl, and mark ready.

    Calls ``async_db_manager.ainit_db()``, then crawls
    ``https://google.com/`` through ``arun`` (quietly, without bypassing
    the cache) and discards the result. Afterwards ``self.ready`` is set
    to ``True``. Progress messages are printed only when ``self.verbose``
    is truthy.
    """
    if self.verbose:
        print("[LOG] 🌤️ Warming up the AsyncWebCrawler")

    await async_db_manager.ainit_db()

    # Fixed arguments for the warm-up crawl; its result is not used.
    warmup_options = {
        "url": "https://google.com/",
        "word_count_threshold": 5,
        "bypass_cache": False,
        "verbose": False,
    }
    await self.arun(**warmup_options)

    self.ready = True
    if self.verbose:
        print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
61
62 async def arun(
63 self,
64 url: str,
65 word_count_threshold=MIN_WORD_THRESHOLD,
66 extraction_strategy: ExtractionStrategy = None,
67 chunking_strategy: ChunkingStrategy = RegexChunking(),
68 bypass_cache: bool = False,
69 css_selector: str = None,
70 screenshot: bool = False,
71 user_agent: str = None,
72 verbose=True,
73 **kwargs,
74 ) -> CrawlResult:
75 try:
76 extraction_strategy = extraction_strategy or NoExtractionStrategy()
77 extraction_strategy.verbose = verbose
78 if not isinstance(extraction_strategy, ExtractionStrategy):

Callers 15

test_extract_markdown — Function · 0.90
test_extract_media — Function · 0.90
test_extract_links — Function · 0.90
test_extract_metadata — Function · 0.90
test_css_selector — Function · 0.90
test_screenshot — Function · 0.90
test_custom_user_agent — Function · 0.90

Calls

no outgoing calls

Tested by 15

test_extract_markdown — Function · 0.72
test_extract_media — Function · 0.72
test_extract_links — Function · 0.72
test_extract_metadata — Function · 0.72
test_css_selector — Function · 0.72
test_screenshot — Function · 0.72
test_custom_user_agent — Function · 0.72

Used in the wild: real call sites across dependent graphs

searching dependent graphs…