hub / github.com/unclecode/crawl4ai / WebCrawler

Class WebCrawler

crawl4ai/web_crawler.py:19–238 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

17
18
19	class WebCrawler:
def __init__(
    self,
    crawler_strategy: CrawlerStrategy = None,
    always_by_pass_cache: bool = False,
    verbose: bool = False,
):
    """Store the crawl strategy, create the on-disk folders and init the DB.

    Args:
        crawler_strategy: Strategy object kept on the instance. When a falsy
            value is passed, a ``LocalSeleniumCrawlerStrategy`` is built
            instead, receiving the same ``verbose`` flag.
        always_by_pass_cache: Stored unchanged on the instance.
        verbose: Only used here to configure the default strategy.
    """
    if not crawler_strategy:
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=verbose)
    self.crawler_strategy = crawler_strategy
    self.always_by_pass_cache = always_by_pass_cache

    # The working folder lives under the current user's home directory.
    self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
    for folder in (self.crawl4ai_folder, f"{self.crawl4ai_folder}/cache"):
        os.makedirs(folder, exist_ok=True)

    init_db()

    # Starts out False; warmup() flips it to True after its crawl.
    self.ready = False
28
def warmup(self):
    """Perform one crawl through ``run`` and then mark the crawler ready.

    Prints a log line before and after. ``self.ready`` is set to True only
    once the ``run`` call has returned.
    """
    print("[LOG] 🌤️ Warming up the WebCrawler")

    # Arguments for the single warm-up crawl, forwarded as keywords.
    warmup_args = {
        "url": 'https://google.com/',
        "word_count_threshold": 5,
        "extraction_strategy": NoExtractionStrategy(),
        "bypass_cache": False,
        "verbose": False,
    }
    self.run(**warmup_args)

    self.ready = True
    print("[LOG] 🌞 WebCrawler is ready to crawl")
40
def fetch_page(
    self,
    url_model: UrlModel,
    provider: str = DEFAULT_PROVIDER,
    api_token: str = None,
    extract_blocks_flag: bool = True,
    word_count_threshold=MIN_WORD_THRESHOLD,
    css_selector: str = None,
    screenshot: bool = False,
    use_cached_html: bool = False,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    **kwargs,
) -> CrawlResult:
    """Crawl the single URL described by ``url_model`` by delegating to ``run``.

    Args:
        url_model: Supplies ``url`` (the address to crawl) and ``forced``
            (forwarded to ``run`` as ``bypass_cache``).
        provider: Accepted for compatibility; not read by this method.
        api_token: Accepted for compatibility; not read by this method.
        extract_blocks_flag: Accepted for compatibility; not read by this
            method.
        word_count_threshold: Forwarded positionally to ``run``.
        css_selector: Forwarded to ``run`` as a keyword argument.
        screenshot: Forwarded to ``run`` as a keyword argument.
        use_cached_html: Accepted for compatibility; not read by this method.
        extraction_strategy: Forwarded to ``run``; a falsy value is replaced
            by a new ``NoExtractionStrategy()``.
        chunking_strategy: Forwarded positionally to ``run``.
        **kwargs: Passed through to ``run`` unchanged.

    Returns:
        Whatever ``run`` returns for this URL.
    """
    # NOTE: the ``RegexChunking()`` default is created once, when the method
    # is defined, so calls relying on the default share that one instance.
    return self.run(
        url_model.url,
        word_count_threshold,
        extraction_strategy or NoExtractionStrategy(),
        chunking_strategy,
        bypass_cache=url_model.forced,
        css_selector=css_selector,
        screenshot=screenshot,
        **kwargs,
    )
66
67	def fetch_pages(
68	self,
69	url_models: List[UrlModel],
70	provider: str = DEFAULT_PROVIDER,
71	api_token: str = None,
72	extract_blocks_flag: bool = True,
73	word_count_threshold=MIN_WORD_THRESHOLD,
74	use_cached_html: bool = False,
75	css_selector: str = None,
76	screenshot: bool = False,

Callers 6

get_crawler — Function · 0.90

setUp — Method · 0.90

summarize_page.py — File · 0.90

create_crawler — Function · 0.90

using_crawler_hooks — Function · 0.90

llm_extraction_openai_pricing.py — File · 0.90

Calls

no outgoing calls

Tested by 1

setUp — Method · 0.72

Used in the wild — real call sites across dependent graphs

searching dependent graphs…