MCPcopy
hub / github.com/unclecode/crawl4ai / WebCrawler

Class WebCrawler

crawl4ai/web_crawler.py:19–238  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

17
18
19class WebCrawler:
def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
    """Set up the crawler strategy, the on-disk working folders and the database.

    Args:
        crawler_strategy: Strategy object stored on the instance. When a falsy
            value is given, a ``LocalSeleniumCrawlerStrategy`` is created with
            the supplied ``verbose`` flag.
        always_by_pass_cache: Stored as-is on ``self.always_by_pass_cache``.
        verbose: Only forwarded to the default strategy; not stored here.
    """
    # Truthiness check (not ``is None``) mirrors the ``x or default`` contract.
    if not crawler_strategy:
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=verbose)
    self.crawler_strategy = crawler_strategy
    self.always_by_pass_cache = always_by_pass_cache

    # Working folder lives under the user's home directory; the cache folder
    # sits directly beneath it.
    base_folder = os.path.join(Path.home(), ".crawl4ai")
    self.crawl4ai_folder = base_folder
    for folder in (base_folder, f"{base_folder}/cache"):
        os.makedirs(folder, exist_ok=True)

    init_db()

    # Stays False until warmup() completes.
    self.ready = False
28
def warmup(self):
    """Perform one crawl of https://google.com/ and then mark the crawler ready.

    The result of the crawl is discarded; ``self.ready`` is set to True only
    after ``run`` returns.
    """
    print("[LOG] 🌤️ Warming up the WebCrawler")

    warmup_options = {
        "url": 'https://google.com/',
        "word_count_threshold": 5,
        "extraction_strategy": NoExtractionStrategy(),
        "bypass_cache": False,
        "verbose": False,
    }
    self.run(**warmup_options)

    self.ready = True
    print("[LOG] 🌞 WebCrawler is ready to crawl")
40
def fetch_page(
    self,
    url_model: UrlModel,
    provider: str = DEFAULT_PROVIDER,
    api_token: str = None,
    extract_blocks_flag: bool = True,
    word_count_threshold=MIN_WORD_THRESHOLD,
    css_selector: str = None,
    screenshot: bool = False,
    use_cached_html: bool = False,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    **kwargs,
) -> CrawlResult:
    """Crawl the single URL described by ``url_model`` by delegating to ``run``.

    Args:
        url_model: Supplies ``url`` (the address to crawl) and ``forced``
            (passed to ``run`` as ``bypass_cache``).
        provider: Accepted but not used by this method.
        api_token: Accepted but not used by this method.
        extract_blocks_flag: Accepted but not used by this method.
        word_count_threshold: Forwarded positionally to ``run``.
        css_selector: Forwarded to ``run``.
        screenshot: Forwarded to ``run``.
        use_cached_html: Accepted but not used by this method.
        extraction_strategy: Forwarded to ``run``; a falsy value is replaced
            with a fresh ``NoExtractionStrategy()``.
        chunking_strategy: Forwarded positionally to ``run``.
        **kwargs: Forwarded unchanged to ``run``.

    Returns:
        Whatever ``run`` returns for this URL.
    """
    # NOTE(review): the ``RegexChunking()`` default is built once when the
    # function is defined, so every call relying on the default shares one
    # instance. Left as-is to keep the signature unchanged; confirm the
    # chunker holds no per-call state.
    return self.run(
        url_model.url,
        word_count_threshold,
        extraction_strategy or NoExtractionStrategy(),
        chunking_strategy,
        bypass_cache=url_model.forced,
        css_selector=css_selector,
        screenshot=screenshot,
        **kwargs,
    )
66
67 def fetch_pages(
68 self,
69 url_models: List[UrlModel],
70 provider: str = DEFAULT_PROVIDER,
71 api_token: str = None,
72 extract_blocks_flag: bool = True,
73 word_count_threshold=MIN_WORD_THRESHOLD,
74 use_cached_html: bool = False,
75 css_selector: str = None,
76 screenshot: bool = False,

Callers 6

get_crawler · Function · 0.90
setUp · Method · 0.90
summarize_page.py · File · 0.90
create_crawler · Function · 0.90
using_crawler_hooks · Function · 0.90

Calls

no outgoing calls

Tested by 1

setUp · Method · 0.72

Used in the wild — real call sites across dependent graphs

searching dependent graphs…