| 19 | |
| 20 | |
| 21 | class AsyncWebCrawler: |
def __init__(
    self,
    crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
    always_by_pass_cache: bool = False,
    base_directory: str = str(Path.home()),
    **kwargs,
):
    """Set up the crawler's strategy, flags and on-disk working folders.

    Args:
        crawler_strategy: Strategy object used for crawling. When it is
            falsy, an ``AsyncPlaywrightCrawlerStrategy`` is built from
            ``**kwargs`` instead.
        always_by_pass_cache: Stored as-is on the instance.
        base_directory: Directory under which the ``.crawl4ai`` folder is
            created. The default is the home directory, evaluated once
            when this function is defined.
        **kwargs: Forwarded to ``AsyncPlaywrightCrawlerStrategy`` when no
            strategy is supplied; the ``verbose`` key (default ``False``)
            is also read to set ``self.verbose``.
    """
    if not crawler_strategy:
        crawler_strategy = AsyncPlaywrightCrawlerStrategy(**kwargs)
    self.crawler_strategy = crawler_strategy
    self.always_by_pass_cache = always_by_pass_cache

    # Working folder plus its "cache" sub-folder; both are created eagerly.
    self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
    for directory in (self.crawl4ai_folder, f"{self.crawl4ai_folder}/cache"):
        os.makedirs(directory, exist_ok=True)

    # Flipped to True by awarmup() once the warm-up crawl has completed.
    self.ready = False
    self.verbose = kwargs.get("verbose", False)
| 39 | |
async def __aenter__(self):
    """Enter the crawler strategy's async context, warm up, return self.

    Returns:
        This crawler instance, so it can be bound by ``async with ... as``.

    Raises:
        Whatever ``self.awarmup()`` raises. In that case the strategy's
        ``__aexit__`` is awaited first so the already-entered strategy is
        not left open.
    """
    await self.crawler_strategy.__aenter__()
    try:
        await self.awarmup()
    except BaseException as exc:
        # `async with` does not call __aexit__ when __aenter__ raises, so
        # the strategy entered above must be released here before the
        # error propagates.
        await self.crawler_strategy.__aexit__(
            type(exc), exc, exc.__traceback__
        )
        raise
    return self
| 44 | |
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Leave the async context by delegating to the crawler strategy.

    The exception triple is passed through unchanged. The strategy's
    return value is not propagated: this method always returns ``None``,
    so an in-flight exception is never suppressed here.
    """
    strategy = self.crawler_strategy
    exception_info = (exc_type, exc_val, exc_tb)
    await strategy.__aexit__(*exception_info)
| 47 | |
async def awarmup(self):
    """Prepare the crawler for use and mark it ready.

    Awaits ``async_db_manager.ainit_db()`` (presumably sets up the cache
    database -- confirm against that module), then performs one crawl of
    ``https://google.com/`` through ``self.arun`` and finally sets
    ``self.ready`` to ``True``. Progress messages are printed only while
    ``self.verbose`` is truthy.
    """

    def _log(message):
        # self.verbose is consulted at each call, not cached up front.
        if self.verbose:
            print(message)

    _log("[LOG] 🌤️ Warming up the AsyncWebCrawler")
    await async_db_manager.ainit_db()

    # Fixed arguments for the single warm-up crawl.
    warmup_request = {
        "url": "https://google.com/",
        "word_count_threshold": 5,
        "bypass_cache": False,
        "verbose": False,
    }
    await self.arun(**warmup_request)

    self.ready = True
    _log("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
| 61 | |
| 62 | async def arun( |
| 63 | self, |
| 64 | url: str, |
| 65 | word_count_threshold=MIN_WORD_THRESHOLD, |
| 66 | extraction_strategy: ExtractionStrategy = None, |
| 67 | chunking_strategy: ChunkingStrategy = RegexChunking(), |
| 68 | bypass_cache: bool = False, |
| 69 | css_selector: str = None, |
| 70 | screenshot: bool = False, |
| 71 | user_agent: str = None, |
| 72 | verbose=True, |
| 73 | **kwargs, |
| 74 | ) -> CrawlResult: |
| 75 | try: |
| 76 | extraction_strategy = extraction_strategy or NoExtractionStrategy() |
| 77 | extraction_strategy.verbose = verbose |
| 78 | if not isinstance(extraction_strategy, ExtractionStrategy): |
no outgoing calls
searching dependent graphs…