MCPcopy
hub / github.com/unclecode/crawl4ai / get_content_of_website_optimized

Function get_content_of_website_optimized

crawl4ai/utils.py:492–734  ·  view source on GitHub ↗
(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs)

Source from the content-addressed store, hash-verified

490 raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
491
492 def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
493 if not html:
494 return None
495
496 soup = BeautifulSoup(html, 'html.parser')
497 body = soup.body
498
499 image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
500
501 for tag in kwargs.get('excluded_tags', []) or []:
502 for el in body.select(tag):
503 el.decompose()
504
505 if css_selector:
506 selected_elements = body.select(css_selector)
507 if not selected_elements:
508 raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
509 body = soup.new_tag('div')
510 for el in selected_elements:
511 body.append(el)
512
513 links = {'internal': [], 'external': []}
514 media = {'images': [], 'videos': [], 'audios': []}
515
516 # Extract meaningful text for media files from closest parent
517 def find_closest_parent_with_useful_text(tag):
518 current_tag = tag
519 while current_tag:
520 current_tag = current_tag.parent
521 # Get the text content from the parent tag
522 if current_tag:
523 text_content = current_tag.get_text(separator=' ',strip=True)
524 # Check if the text content has at least image_description_min_word_threshold words
525 if len(text_content.split()) >= image_description_min_word_threshold:
526 return text_content
527 return None
528
529 def process_image(img, url, index, total_images):
530 # Check that an image is displayed, has a src, is not a button/icon/logo, and is not inside undesired HTML elements
531 def is_valid_image(img, parent, parent_classes):
532 style = img.get('style', '')
533 src = img.get('src', '')
534 classes_to_check = ['button', 'icon', 'logo']
535 tags_to_check = ['button', 'input']
536 return all([
537 'display:none' not in style,
538 src,
539 not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
540 parent.name not in tags_to_check
541 ])
542
543 # Score an image for its usefulness
544 def score_image_for_usefulness(img, base_url, index, images_count):
545 # Function to parse image height/width value and units
546 def parse_dimension(dimension):
547 if dimension:
548 match = re.match(r"(\d+)(\D*)", dimension)
549 if match:

Callers 1

process_html · Method · 0.85

Calls 9

process_image · Function · 0.85
process_element · Function · 0.85
flatten_nested_elements · Function · 0.85
sanitize_html · Function · 0.85
CustomHTML2Text · Class · 0.85
extract_metadata · Function · 0.85
compile · Method · 0.80
handle · Method · 0.80

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…