| 17 | |
| 18 | |
| 19 | class WebCrawler: |
def __init__(
    self,
    crawler_strategy: CrawlerStrategy = None,
    always_by_pass_cache: bool = False,
    verbose: bool = False,
):
    """Set up the crawl strategy, the on-disk folders and the database.

    Args:
        crawler_strategy: Strategy object stored on the instance. When it
            is falsy, a ``LocalSeleniumCrawlerStrategy`` is created with
            the given ``verbose`` flag.
        always_by_pass_cache: Stored on the instance unchanged.
        verbose: Passed to the default strategy when one is created here.
    """
    if not crawler_strategy:
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=verbose)
    self.crawler_strategy = crawler_strategy
    self.always_by_pass_cache = always_by_pass_cache

    # The working folder lives in the user's home directory; the cache
    # folder sits directly inside it.
    base_dir = os.path.join(Path.home(), ".crawl4ai")
    self.crawl4ai_folder = base_dir
    for folder in (base_dir, f"{base_dir}/cache"):
        os.makedirs(folder, exist_ok=True)

    init_db()

    # Not ready yet; warmup() sets this to True after its test crawl.
    self.ready = False
| 28 | |
def warmup(self):
    """Run one throwaway crawl and then mark the crawler as ready.

    Calls ``self.run`` on ``https://google.com/`` with a
    ``NoExtractionStrategy``, then sets ``self.ready`` to ``True``.
    A log line is printed before the crawl and after it completes.
    """
    print("[LOG] 🌤️ Warming up the WebCrawler")

    warmup_options = dict(
        url='https://google.com/',
        word_count_threshold=5,
        extraction_strategy=NoExtractionStrategy(),
        bypass_cache=False,
        verbose=False,
    )
    self.run(**warmup_options)

    # Only reached when the crawl above did not raise.
    self.ready = True
    print("[LOG] 🌞 WebCrawler is ready to crawl")
| 40 | |
def fetch_page(
    self,
    url_model: UrlModel,
    provider: str = DEFAULT_PROVIDER,
    api_token: str = None,
    extract_blocks_flag: bool = True,
    word_count_threshold=MIN_WORD_THRESHOLD,
    css_selector: str = None,
    screenshot: bool = False,
    use_cached_html: bool = False,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    **kwargs,
) -> CrawlResult:
    """Crawl the URL carried by ``url_model`` by delegating to ``self.run``.

    Args:
        url_model: Supplies ``url`` (the address to crawl) and ``forced``
            (passed to ``run`` as ``bypass_cache``).
        provider: Accepted but not used by this method.
        api_token: Accepted but not used by this method.
        extract_blocks_flag: Accepted but not used by this method.
        word_count_threshold: Forwarded to ``run`` as its second
            positional argument.
        css_selector: Forwarded to ``run`` as ``css_selector``.
        screenshot: Forwarded to ``run`` as ``screenshot``.
        use_cached_html: Accepted but not used by this method.
        extraction_strategy: Forwarded to ``run``; a falsy value is
            replaced by a new ``NoExtractionStrategy()``.
        chunking_strategy: Forwarded to ``run`` as its fourth positional
            argument.
        **kwargs: Forwarded to ``run`` unchanged.

    Returns:
        Whatever ``self.run`` returns for this URL.
    """
    # NOTE(review): the RegexChunking() default is built once, when the
    # function is defined, so every call that omits chunking_strategy shares
    # the same instance. Harmless if the strategy is stateless -- confirm.
    strategy = extraction_strategy or NoExtractionStrategy()
    return self.run(
        url_model.url,
        word_count_threshold,
        strategy,
        chunking_strategy,
        bypass_cache=url_model.forced,
        css_selector=css_selector,
        screenshot=screenshot,
        **kwargs,
    )
| 66 | |
| 67 | def fetch_pages( |
| 68 | self, |
| 69 | url_models: List[UrlModel], |
| 70 | provider: str = DEFAULT_PROVIDER, |
| 71 | api_token: str = None, |
| 72 | extract_blocks_flag: bool = True, |
| 73 | word_count_threshold=MIN_WORD_THRESHOLD, |
| 74 | use_cached_html: bool = False, |
| 75 | css_selector: str = None, |
| 76 | screenshot: bool = False, |
no outgoing calls
searching dependent graphs…