(
self,
url: str,
html: str,
extracted_content: str,
word_count_threshold: int,
extraction_strategy: ExtractionStrategy,
chunking_strategy: ChunkingStrategy,
css_selector: str,
screenshot: bool,
verbose: bool,
is_cached: bool,
**kwargs,
)
| 284 | return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs) |
| 285 | |
| 286 | def process_html( |
| 287 | self, |
| 288 | url: str, |
| 289 | html: str, |
| 290 | extracted_content: str, |
| 291 | word_count_threshold: int, |
| 292 | extraction_strategy: ExtractionStrategy, |
| 293 | chunking_strategy: ChunkingStrategy, |
| 294 | css_selector: str, |
| 295 | screenshot: bool, |
| 296 | verbose: bool, |
| 297 | is_cached: bool, |
| 298 | **kwargs, |
| 299 | ) -> CrawlResult: |
| 300 | t = time.time() |
| 301 | # Extract content from HTML |
| 302 | try: |
| 303 | result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector) |
| 304 | metadata = extract_metadata(html) |
| 305 | if result is None: |
| 306 | raise ValueError(f"Failed to extract content from the website: {url}") |
| 307 | except InvalidCSSSelectorError as e: |
| 308 | raise ValueError(str(e)) |
| 309 | |
| 310 | cleaned_html = result.get("cleaned_html", "") |
| 311 | markdown = result.get("markdown", "") |
| 312 | media = result.get("media", []) |
| 313 | links = result.get("links", []) |
| 314 | |
| 315 | if verbose: |
| 316 | print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds") |
| 317 | |
| 318 | if extracted_content is None: |
| 319 | if verbose: |
| 320 | print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}") |
| 321 | |
| 322 | sections = chunking_strategy.chunk(markdown) |
| 323 | extracted_content = extraction_strategy.run(url, sections) |
| 324 | extracted_content = json.dumps(extracted_content) |
| 325 | |
| 326 | if verbose: |
| 327 | print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.") |
| 328 | |
| 329 | screenshot = None if not screenshot else screenshot |
| 330 | |
| 331 | if not is_cached: |
| 332 | cache_url( |
| 333 | url, |
| 334 | html, |
| 335 | cleaned_html, |
| 336 | markdown, |
| 337 | extracted_content, |
| 338 | True, |
| 339 | json.dumps(media), |
| 340 | json.dumps(links), |
| 341 | json.dumps(metadata), |
| 342 | screenshot=screenshot, |
| 343 | ) |
no test coverage detected