(
self,
url: str,
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
)
| 240 | return results |
| 241 | |
def run(
    self,
    url: str,
    word_count_threshold=MIN_WORD_THRESHOLD,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    bypass_cache: bool = False,
    css_selector: str = None,
    screenshot: bool = False,
    user_agent: str = None,
    verbose=True,
    **kwargs,
) -> CrawlResult:
    """Fetch one URL (from cache when allowed) and hand it to ``process_html``.

    Args:
        url: Address to crawl; also used as the cache lookup key.
        word_count_threshold: Values below ``MIN_WORD_THRESHOLD`` are raised
            to ``MIN_WORD_THRESHOLD`` before being forwarded.
        extraction_strategy: Strategy instance; a falsy value is replaced by
            ``NoExtractionStrategy()``. Its ``verbose`` attribute is
            overwritten with ``verbose``.
        chunking_strategy: Strategy instance forwarded to ``process_html``.
            NOTE(review): the default ``RegexChunking()`` is built once at
            definition time, so every call relying on the default shares
            that one instance.
        bypass_cache: When true, skip the cache lookup for this call.
        css_selector: Forwarded unchanged to ``process_html``.
        screenshot: When truthy, replaced by the cached screenshot entry or
            by ``crawler_strategy.take_screenshot()`` before forwarding;
            otherwise forwarded as given.
        user_agent: If truthy, applied through
            ``crawler_strategy.update_user_agent`` before a fresh crawl.
            It is not applied when the cached copy is used.
        verbose: Forwarded to ``process_html`` and copied onto the
            extraction strategy.
        **kwargs: Forwarded unchanged to ``process_html``.

    Returns:
        Whatever ``process_html`` returns (annotated as ``CrawlResult``).

    Raises:
        ValueError: If ``extraction_strategy`` is not an
            ``ExtractionStrategy`` or ``chunking_strategy`` is not a
            ``ChunkingStrategy``.
    """
    extraction_strategy = extraction_strategy or NoExtractionStrategy()

    # Validate *before* touching the object: assigning ``.verbose`` on an
    # arbitrary value (e.g. a str) would raise AttributeError and mask the
    # intended ValueError below.
    if not isinstance(extraction_strategy, ExtractionStrategy):
        raise ValueError("Unsupported extraction strategy")
    extraction_strategy.verbose = verbose

    if not isinstance(chunking_strategy, ChunkingStrategy):
        raise ValueError("Unsupported chunking strategy")

    # Clamp to the configured minimum.
    if word_count_threshold < MIN_WORD_THRESHOLD:
        word_count_threshold = MIN_WORD_THRESHOLD

    # Check cache first, unless bypassed for this call or for the instance.
    cached = None
    extracted_content = None
    if not bypass_cache and not self.always_by_pass_cache:
        cached = get_cached_url(url)

    if cached:
        # Positional cache record: presumably 1 = html, 2 = extracted
        # content, 9 = screenshot -- TODO confirm against the cache schema.
        html = cached[1]
        extracted_content = cached[2]
        if screenshot:
            screenshot = cached[9]
    else:
        if user_agent:
            self.crawler_strategy.update_user_agent(user_agent)
        html = self.crawler_strategy.crawl(url)
        if screenshot:
            screenshot = self.crawler_strategy.take_screenshot()

    # The final positional flag tells process_html whether the cached copy
    # was used.
    return self.process_html(
        url,
        html,
        extracted_content,
        word_count_threshold,
        extraction_strategy,
        chunking_strategy,
        css_selector,
        screenshot,
        verbose,
        bool(cached),
        **kwargs,
    )
| 285 | |
| 286 | def process_html( |
| 287 | self, |
no test coverage detected