MCPcopy
hub / github.com/unclecode/crawl4ai / run

Method run

crawl4ai/web_crawler.back.py:242–284  ·  view source on GitHub ↗
(
            self,
            url: str,
            word_count_threshold=MIN_WORD_THRESHOLD,
            extraction_strategy: ExtractionStrategy = None,
            chunking_strategy: ChunkingStrategy = RegexChunking(),
            bypass_cache: bool = False,
            css_selector: str = None,
            screenshot: bool = False,
            user_agent: str = None,
            verbose=True,
            **kwargs,
        )

Source from the content-addressed store, hash-verified

240 return results
241
def run(
    self,
    url: str,
    word_count_threshold=MIN_WORD_THRESHOLD,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    bypass_cache: bool = False,
    css_selector: str = None,
    screenshot: bool = False,
    user_agent: str = None,
    verbose=True,
    **kwargs,
) -> CrawlResult:
    """Fetch ``url`` (from cache or by crawling) and pass the HTML to ``process_html``.

    Args:
        url: Address to crawl; also used as the cache lookup key.
        word_count_threshold: Values below ``MIN_WORD_THRESHOLD`` are raised
            to ``MIN_WORD_THRESHOLD`` before being forwarded.
        extraction_strategy: Strategy forwarded to ``process_html``. A falsy
            value is replaced by ``NoExtractionStrategy()``. Its ``verbose``
            attribute is overwritten with ``verbose``.
        chunking_strategy: Strategy forwarded to ``process_html``.
            NOTE(review): the default ``RegexChunking()`` is built once at
            definition time and shared by every call that relies on it --
            confirm it holds no per-call state.
        bypass_cache: When true (or when ``self.always_by_pass_cache`` is
            truthy) the cache lookup is skipped and the page is crawled.
        css_selector: Forwarded unchanged to ``process_html``.
        screenshot: When truthy, replaced by the cached entry's field 9 on a
            cache hit, or by ``self.crawler_strategy.take_screenshot()`` after
            a fresh crawl, and then forwarded.
        user_agent: When truthy and the page is actually crawled, passed to
            ``self.crawler_strategy.update_user_agent`` before crawling. It is
            not applied on a cache hit.
        verbose: Forwarded to ``process_html`` and copied onto the
            extraction strategy.
        **kwargs: Forwarded unchanged to ``process_html``.

    Returns:
        The value returned by ``self.process_html``.

    Raises:
        ValueError: If ``extraction_strategy`` is not an
            ``ExtractionStrategy`` or ``chunking_strategy`` is not a
            ``ChunkingStrategy``.
    """
    extraction_strategy = extraction_strategy or NoExtractionStrategy()

    # Validate BEFORE touching the object: assigning ``.verbose`` on an
    # unsupported value (e.g. a str) would raise AttributeError and hide the
    # intended ValueError, or mutate an object we are about to reject.
    if not isinstance(extraction_strategy, ExtractionStrategy):
        raise ValueError("Unsupported extraction strategy")
    extraction_strategy.verbose = verbose

    if not isinstance(chunking_strategy, ChunkingStrategy):
        raise ValueError("Unsupported chunking strategy")

    # Never go below the configured minimum threshold.
    if word_count_threshold < MIN_WORD_THRESHOLD:
        word_count_threshold = MIN_WORD_THRESHOLD

    # Check cache first
    cached = None
    extracted_content = None
    if not bypass_cache and not self.always_by_pass_cache:
        cached = get_cached_url(url)

    if cached:
        # Positional layout of the cache entry as read here:
        # [1] -> html, [2] -> extracted content, [9] -> screenshot.
        html = cached[1]
        extracted_content = cached[2]
        if screenshot:
            screenshot = cached[9]
    else:
        if user_agent:
            self.crawler_strategy.update_user_agent(user_agent)
        html = self.crawler_strategy.crawl(url)
        if screenshot:
            screenshot = self.crawler_strategy.take_screenshot()

    # ``bool(cached)`` tells process_html whether the HTML came from the cache.
    return self.process_html(
        url,
        html,
        extracted_content,
        word_count_threshold,
        extraction_strategy,
        chunking_strategy,
        css_selector,
        screenshot,
        verbose,
        bool(cached),
        **kwargs,
    )
285
286 def process_html(
287 self,

Callers 4

warmupMethod · 0.95
fetch_pageMethod · 0.95
run_oldMethod · 0.45
process_htmlMethod · 0.45

Calls 7

process_htmlMethod · 0.95
RegexChunkingClass · 0.85
get_cached_urlFunction · 0.85
update_user_agentMethod · 0.45
crawlMethod · 0.45
take_screenshotMethod · 0.45

Tested by

no test coverage detected