(
self,
url_model: UrlModel,
provider: str = DEFAULT_PROVIDER,
api_token: str = None,
extract_blocks_flag: bool = True,
word_count_threshold=MIN_WORD_THRESHOLD,
css_selector: str = None,
screenshot: bool = False,
use_cached_html: bool = False,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
**kwargs,
)
| 39 | print("[LOG] 🌞 WebCrawler is ready to crawl") |
| 40 | |
def fetch_page(
    self,
    url_model: UrlModel,
    provider: str = DEFAULT_PROVIDER,
    api_token: str = None,
    extract_blocks_flag: bool = True,
    word_count_threshold=MIN_WORD_THRESHOLD,
    css_selector: str = None,
    screenshot: bool = False,
    use_cached_html: bool = False,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    **kwargs,
) -> CrawlResult:
    """Crawl the page described by ``url_model`` by delegating to ``self.run``.

    Args:
        url_model: Supplies ``url`` (the address passed to ``run``) and
            ``forced`` (passed to ``run`` as ``bypass_cache``).
        provider: Accepted but not used by this method.
        api_token: Accepted but not used by this method.
        extract_blocks_flag: Accepted but not used by this method.
        word_count_threshold: Forwarded positionally to ``run``.
        css_selector: Forwarded to ``run`` as ``css_selector``.
        screenshot: Forwarded to ``run`` as ``screenshot``.
        use_cached_html: Accepted but not used by this method.
        extraction_strategy: Forwarded to ``run``; any falsy value
            (including ``None``) is replaced by a new
            ``NoExtractionStrategy()``.
        chunking_strategy: Forwarded positionally to ``run``.
        **kwargs: Forwarded unchanged to ``run``.

    Returns:
        Whatever ``self.run`` returns for this request.
    """
    # NOTE(review): the ``RegexChunking()`` default is built once, when the
    # function is defined, so every call that omits ``chunking_strategy``
    # shares that single instance -- confirm the strategy keeps no per-call
    # state.
    return self.run(
        url_model.url,
        word_count_threshold,
        extraction_strategy or NoExtractionStrategy(),
        chunking_strategy,
        bypass_cache=url_model.forced,
        css_selector=css_selector,
        screenshot=screenshot,
        **kwargs,
    )
| 66 | |
| 67 | def fetch_pages( |
| 68 | self, |
no test coverage detected