MCPcopy
hub / github.com/unclecode/crawl4ai / arun

Method arun

crawl4ai/async_webcrawler.py:62–139  ·  view source on GitHub ↗
(
        self,
        url: str,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    )

Source from the content-addressed store, hash-verified

60 print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
61
62 async def arun(
63 self,
64 url: str,
65 word_count_threshold=MIN_WORD_THRESHOLD,
66 extraction_strategy: ExtractionStrategy = None,
67 chunking_strategy: ChunkingStrategy = RegexChunking(),
68 bypass_cache: bool = False,
69 css_selector: str = None,
70 screenshot: bool = False,
71 user_agent: str = None,
72 verbose=True,
73 **kwargs,
74 ) -> CrawlResult:
75 try:
76 extraction_strategy = extraction_strategy or NoExtractionStrategy()
77 extraction_strategy.verbose = verbose
78 if not isinstance(extraction_strategy, ExtractionStrategy):
79 raise ValueError("Unsupported extraction strategy")
80 if not isinstance(chunking_strategy, ChunkingStrategy):
81 raise ValueError("Unsupported chunking strategy")
82
83 word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
84
85 async_response: AsyncCrawlResponse = None
86 cached = None
87 screenshot_data = None
88 extracted_content = None
89 if not bypass_cache and not self.always_by_pass_cache:
90 cached = await async_db_manager.aget_cached_url(url)
91
92 if kwargs.get("warmup", True) and not self.ready:
93 return None
94
95 if cached:
96 html = sanitize_input_encode(cached[1])
97 extracted_content = sanitize_input_encode(cached[4])
98 if screenshot:
99 screenshot_data = cached[9]
100 if not screenshot_data:
101 cached = None
102
103 if not cached or not html:
104 t1 = time.time()
105 if user_agent:
106 self.crawler_strategy.update_user_agent(user_agent)
107 async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
108 html = sanitize_input_encode(async_response.html)
109 screenshot_data = async_response.screenshot
110 t2 = time.time()
111 if verbose:
112 print(
113 f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
114 )
115
116 crawl_result = await self.aprocess_html(
117 url,
118 html,
119 extracted_content,

Callers 15

awarmup · Method · 0.95
arun_many · Method · 0.95
main · Function · 0.95
test_extract_markdown · Function · 0.80
test_extract_media · Function · 0.80
test_extract_links · Function · 0.80
test_extract_metadata · Function · 0.80
test_css_selector · Function · 0.80

Calls 8

aprocess_html · Method · 0.95
RegexChunking · Class · 0.85
sanitize_input_encode · Function · 0.85
CrawlResult · Class · 0.85
aget_cached_url · Method · 0.80
update_user_agent · Method · 0.45
crawl · Method · 0.45

Tested by 15

test_extract_markdown · Function · 0.64
test_extract_media · Function · 0.64
test_extract_links · Function · 0.64
test_extract_metadata · Function · 0.64
test_css_selector · Function · 0.64
test_screenshot · Function · 0.64
test_custom_user_agent · Function · 0.64