| 60 | print("[LOG] 🌞 AsyncWebCrawler is ready to crawl") |
| 61 | |
| 62 | async def arun( |
| 63 | self, |
| 64 | url: str, |
| 65 | word_count_threshold=MIN_WORD_THRESHOLD, |
| 66 | extraction_strategy: ExtractionStrategy = None, |
| 67 | chunking_strategy: ChunkingStrategy = RegexChunking(), |
| 68 | bypass_cache: bool = False, |
| 69 | css_selector: str = None, |
| 70 | screenshot: bool = False, |
| 71 | user_agent: str = None, |
| 72 | verbose=True, |
| 73 | **kwargs, |
| 74 | ) -> CrawlResult: |
| 75 | try: |
| 76 | extraction_strategy = extraction_strategy or NoExtractionStrategy() |
| 77 | extraction_strategy.verbose = verbose |
| 78 | if not isinstance(extraction_strategy, ExtractionStrategy): |
| 79 | raise ValueError("Unsupported extraction strategy") |
| 80 | if not isinstance(chunking_strategy, ChunkingStrategy): |
| 81 | raise ValueError("Unsupported chunking strategy") |
| 82 | |
| 83 | word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD) |
| 84 | |
| 85 | async_response: AsyncCrawlResponse = None |
| 86 | cached = None |
| 87 | screenshot_data = None |
| 88 | extracted_content = None |
| 89 | if not bypass_cache and not self.always_by_pass_cache: |
| 90 | cached = await async_db_manager.aget_cached_url(url) |
| 91 | |
| 92 | if kwargs.get("warmup", True) and not self.ready: |
| 93 | return None |
| 94 | |
| 95 | if cached: |
| 96 | html = sanitize_input_encode(cached[1]) |
| 97 | extracted_content = sanitize_input_encode(cached[4]) |
| 98 | if screenshot: |
| 99 | screenshot_data = cached[9] |
| 100 | if not screenshot_data: |
| 101 | cached = None |
| 102 | |
| 103 | if not cached or not html: |
| 104 | t1 = time.time() |
| 105 | if user_agent: |
| 106 | self.crawler_strategy.update_user_agent(user_agent) |
| 107 | async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs) |
| 108 | html = sanitize_input_encode(async_response.html) |
| 109 | screenshot_data = async_response.screenshot |
| 110 | t2 = time.time() |
| 111 | if verbose: |
| 112 | print( |
| 113 | f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds" |
| 114 | ) |
| 115 | |
| 116 | crawl_result = await self.aprocess_html( |
| 117 | url, |
| 118 | html, |
| 119 | extracted_content, |