hub / github.com/unclecode/crawl4ai / arun

Method arun

crawl4ai/async_webcrawler.py:62–139 · view source on GitHub ↗

(
        self,
        url: str,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    )

Source from the content-addressed store, hash-verified

60	print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
61
62	async def arun(
63	self,
64	url: str,
65	word_count_threshold=MIN_WORD_THRESHOLD,
66	extraction_strategy: ExtractionStrategy = None,
67	chunking_strategy: ChunkingStrategy = RegexChunking(),
68	bypass_cache: bool = False,
69	css_selector: str = None,
70	screenshot: bool = False,
71	user_agent: str = None,
72	verbose=True,
73	**kwargs,
74	) -> CrawlResult:
75	try:
76	extraction_strategy = extraction_strategy or NoExtractionStrategy()
77	extraction_strategy.verbose = verbose
78	if not isinstance(extraction_strategy, ExtractionStrategy):
79	raise ValueError("Unsupported extraction strategy")
80	if not isinstance(chunking_strategy, ChunkingStrategy):
81	raise ValueError("Unsupported chunking strategy")
82
83	word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
84
85	async_response: AsyncCrawlResponse = None
86	cached = None
87	screenshot_data = None
88	extracted_content = None
89	if not bypass_cache and not self.always_by_pass_cache:
90	cached = await async_db_manager.aget_cached_url(url)
91
92	if kwargs.get("warmup", True) and not self.ready:
93	return None
94
95	if cached:
96	html = sanitize_input_encode(cached[1])
97	extracted_content = sanitize_input_encode(cached[4])
98	if screenshot:
99	screenshot_data = cached[9]
100	if not screenshot_data:
101	cached = None
102
103	if not cached or not html:
104	t1 = time.time()
105	if user_agent:
106	self.crawler_strategy.update_user_agent(user_agent)
107	async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
108	html = sanitize_input_encode(async_response.html)
109	screenshot_data = async_response.screenshot
110	t2 = time.time()
111	if verbose:
112	print(
113	f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
114	)
115
116	crawl_result = await self.aprocess_html(
117	url,
118	html,
119	extracted_content,

Callers 15

awarmup · Method · 0.95

arun_many · Method · 0.95

main · Function · 0.95

test_extract_markdown · Function · 0.80

test_extract_cleaned_html · Function · 0.80

test_extract_media · Function · 0.80

test_extract_links · Function · 0.80

test_extract_metadata · Function · 0.80

test_css_selector_extraction · Function · 0.80

test_word_count_threshold · Function · 0.80

test_css_selector · Function · 0.80

test_javascript_execution · Function · 0.80

Calls 8

aprocess_html · Method · 0.95

RegexChunking · Class · 0.85

NoExtractionStrategy · Class · 0.85

sanitize_input_encode · Function · 0.85

CrawlResult · Class · 0.85

aget_cached_url · Method · 0.80

update_user_agent · Method · 0.45

crawl · Method · 0.45

Tested by 15

test_extract_markdown · Function · 0.64

test_extract_cleaned_html · Function · 0.64

test_extract_media · Function · 0.64

test_extract_links · Function · 0.64

test_extract_metadata · Function · 0.64

test_css_selector_extraction · Function · 0.64

test_word_count_threshold · Function · 0.64

test_css_selector · Function · 0.64

test_javascript_execution · Function · 0.64

test_screenshot · Function · 0.64

test_custom_user_agent · Function · 0.64

test_extract_media_and_links · Function · 0.64