hub / github.com/unclecode/crawl4ai / run

Method run

crawl4ai/web_crawler.py:105–164 · view source on GitHub ↗

(
            self,
            url: str,
            word_count_threshold=MIN_WORD_THRESHOLD,
            extraction_strategy: ExtractionStrategy = None,
            chunking_strategy: ChunkingStrategy = RegexChunking(),
            bypass_cache: bool = False,
            css_selector: str = None,
            screenshot: bool = False,
            user_agent: str = None,
            verbose=True,
            **kwargs,
        )

Source from the content-addressed store, hash-verified

103	return results
104
105	def run(
106	self,
107	url: str,
108	word_count_threshold=MIN_WORD_THRESHOLD,
109	extraction_strategy: ExtractionStrategy = None,
110	chunking_strategy: ChunkingStrategy = RegexChunking(),
111	bypass_cache: bool = False,
112	css_selector: str = None,
113	screenshot: bool = False,
114	user_agent: str = None,
115	verbose=True,
116	**kwargs,
117	) -> CrawlResult:
118	try:
119	extraction_strategy = extraction_strategy or NoExtractionStrategy()
120	extraction_strategy.verbose = verbose
121	if not isinstance(extraction_strategy, ExtractionStrategy):
122	raise ValueError("Unsupported extraction strategy")
123	if not isinstance(chunking_strategy, ChunkingStrategy):
124	raise ValueError("Unsupported chunking strategy")
125
126	word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
127
128	cached = None
129	screenshot_data = None
130	extracted_content = None
131	if not bypass_cache and not self.always_by_pass_cache:
132	cached = get_cached_url(url)
133
134	if kwargs.get("warmup", True) and not self.ready:
135	return None
136
137	if cached:
138	html = sanitize_input_encode(cached[1])
139	extracted_content = sanitize_input_encode(cached[4])
140	if screenshot:
141	screenshot_data = cached[9]
142	if not screenshot_data:
143	cached = None
144
145	if not cached or not html:
146	if user_agent:
147	self.crawler_strategy.update_user_agent(user_agent)
148	t1 = time.time()
149	html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
150	t2 = time.time()
151	if verbose:
152	print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds")
153	if screenshot:
154	screenshot_data = self.crawler_strategy.take_screenshot()
155
156
157	crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
158	crawl_result.success = bool(html)
159	return crawl_result
160	except Exception as e:
161	if not hasattr(e, "msg"):
162	e.msg = str(e)

Callers 7

warmup · Method · 0.95

fetch_page · Method · 0.95

using_crawler_hooks · Function · 0.95

using_crawler_hooks_dleay_example · Function · 0.95

load_spacy_model · Function · 0.45

aprocess_html · Method · 0.45

process_html · Method · 0.45

Calls 9

process_html · Method · 0.95

RegexChunking · Class · 0.85

NoExtractionStrategy · Class · 0.85

get_cached_url · Function · 0.85

sanitize_input_encode · Function · 0.85

CrawlResult · Class · 0.85

update_user_agent · Method · 0.45

crawl · Method · 0.45

take_screenshot · Method · 0.45

Tested by

no test coverage detected