hub / github.com/unclecode/crawl4ai / get_content_of_website_optimized

Function get_content_of_website_optimized

crawl4ai/utils.py:492–734 · view source on GitHub ↗

(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs)

Source from the content-addressed store, hash-verified

490	raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
491
492	def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
493	if not html:
494	return None
495
496	soup = BeautifulSoup(html, 'html.parser')
497	body = soup.body
498
499	image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
500
501	for tag in kwargs.get('excluded_tags', []) or []:
502	for el in body.select(tag):
503	el.decompose()
504
505	if css_selector:
506	selected_elements = body.select(css_selector)
507	if not selected_elements:
508	raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
509	body = soup.new_tag('div')
510	for el in selected_elements:
511	body.append(el)
512
513	links = {'internal': [], 'external': []}
514	media = {'images': [], 'videos': [], 'audios': []}
515
516	# Extract meaningful text for media files from closest parent
517	def find_closest_parent_with_useful_text(tag):
518	current_tag = tag
519	while current_tag:
520	current_tag = current_tag.parent
521	# Get the text content from the parent tag
522	if current_tag:
523	text_content = current_tag.get_text(separator=' ',strip=True)
524	# Check if the text content has at least image_description_min_word_threshold words
525	if len(text_content.split()) >= image_description_min_word_threshold:
526	return text_content
527	return None
528
529	def process_image(img, url, index, total_images):
530	#Check if an image has valid display and inside undesired html elements
531	def is_valid_image(img, parent, parent_classes):
532	style = img.get('style', '')
533	src = img.get('src', '')
534	classes_to_check = ['button', 'icon', 'logo']
535	tags_to_check = ['button', 'input']
536	return all([
537	'display:none' not in style,
538	src,
539	not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
540	parent.name not in tags_to_check
541	])
542
543	#Score an image for its usefulness
544	def score_image_for_usefulness(img, base_url, index, images_count):
545	# Function to parse image height/width value and units
546	def parse_dimension(dimension):
547	if dimension:
548	match = re.match(r"(\d+)(\D*)", dimension)
549	if match:

Callers 1

process_html · Method · 0.85

Calls 9

InvalidCSSSelectorError · Class · 0.85

process_image · Function · 0.85

process_element · Function · 0.85

flatten_nested_elements · Function · 0.85

sanitize_html · Function · 0.85

CustomHTML2Text · Class · 0.85

extract_metadata · Function · 0.85

compile · Method · 0.80

handle · Method · 0.80

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…