MCPcopy
hub / github.com/unclecode/crawl4ai / process_html

Method process_html

crawl4ai/web_crawler.back.py:286–357  ·  view source on GitHub ↗
(
            self,
            url: str,
            html: str,
            extracted_content: str,
            word_count_threshold: int,
            extraction_strategy: ExtractionStrategy,
            chunking_strategy: ChunkingStrategy,
            css_selector: str,
            screenshot: bool,
            verbose: bool,
            is_cached: bool,
            **kwargs,
        )

Source from the content-addressed store, hash-verified

284 return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)
285
286 def process_html(
287 self,
288 url: str,
289 html: str,
290 extracted_content: str,
291 word_count_threshold: int,
292 extraction_strategy: ExtractionStrategy,
293 chunking_strategy: ChunkingStrategy,
294 css_selector: str,
295 screenshot: bool,
296 verbose: bool,
297 is_cached: bool,
298 **kwargs,
299 ) -> CrawlResult:
300 t = time.time()
301 # Extract content from HTML
302 try:
303 result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
304 metadata = extract_metadata(html)
305 if result is None:
306 raise ValueError(f"Failed to extract content from the website: {url}")
307 except InvalidCSSSelectorError as e:
308 raise ValueError(str(e))
309
310 cleaned_html = result.get("cleaned_html", "")
311 markdown = result.get("markdown", "")
312 media = result.get("media", [])
313 links = result.get("links", [])
314
315 if verbose:
316 print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
317
318 if extracted_content is None:
319 if verbose:
320 print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
321
322 sections = chunking_strategy.chunk(markdown)
323 extracted_content = extraction_strategy.run(url, sections)
324 extracted_content = json.dumps(extracted_content)
325
326 if verbose:
327 print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
328
329 screenshot = None if not screenshot else screenshot
330
331 if not is_cached:
332 cache_url(
333 url,
334 html,
335 cleaned_html,
336 markdown,
337 extracted_content,
338 True,
339 json.dumps(media),
340 json.dumps(links),
341 json.dumps(metadata),
342 screenshot=screenshot,
343 )

Callers 1

run · Method · 0.95

Calls 6

get_content_of_website · Function · 0.85
extract_metadata · Function · 0.85
cache_url · Function · 0.85
CrawlResult · Class · 0.85
chunk · Method · 0.45
run · Method · 0.45

Tested by

no test coverage detected