hub / github.com/unclecode/crawl4ai / process_html

Method process_html

crawl4ai/web_crawler.back.py:286–357 · view source on GitHub ↗

(
            self,
            url: str,
            html: str,
            extracted_content: str,
            word_count_threshold: int,
            extraction_strategy: ExtractionStrategy,
            chunking_strategy: ChunkingStrategy,
            css_selector: str,
            screenshot: bool,
            verbose: bool,
            is_cached: bool,
            **kwargs,
        )

Source from the content-addressed store, hash-verified

284	return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)
285
286	def process_html(
287	self,
288	url: str,
289	html: str,
290	extracted_content: str,
291	word_count_threshold: int,
292	extraction_strategy: ExtractionStrategy,
293	chunking_strategy: ChunkingStrategy,
294	css_selector: str,
295	screenshot: bool,
296	verbose: bool,
297	is_cached: bool,
298	**kwargs,
299	) -> CrawlResult:
300	t = time.time()
301	# Extract content from HTML
302	try:
303	result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
304	metadata = extract_metadata(html)
305	if result is None:
306	raise ValueError(f"Failed to extract content from the website: {url}")
307	except InvalidCSSSelectorError as e:
308	raise ValueError(str(e))
309
310	cleaned_html = result.get("cleaned_html", "")
311	markdown = result.get("markdown", "")
312	media = result.get("media", [])
313	links = result.get("links", [])
314
315	if verbose:
316	print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
317
318	if extracted_content is None:
319	if verbose:
320	print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
321
322	sections = chunking_strategy.chunk(markdown)
323	extracted_content = extraction_strategy.run(url, sections)
324	extracted_content = json.dumps(extracted_content)
325
326	if verbose:
327	print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")
328
329	screenshot = None if not screenshot else screenshot
330
331	if not is_cached:
332	cache_url(
333	url,
334	html,
335	cleaned_html,
336	markdown,
337	extracted_content,
338	True,
339	json.dumps(media),
340	json.dumps(links),
341	json.dumps(metadata),
342	screenshot=screenshot,
343	)

Callers 1

run · Method · 0.95

Calls 6

get_content_of_website · Function · 0.85

extract_metadata · Function · 0.85

cache_url · Function · 0.85

CrawlResult · Class · 0.85

chunk · Method · 0.45

run · Method · 0.45

Tested by

no test coverage detected