MCPcopy
hub / github.com/unclecode/crawl4ai / process_image

Function process_image

crawl4ai/utils.py:529–611  ·  view source on GitHub ↗
(img, url, index, total_images)

Source from the content-addressed store, hash-verified

527 return None
528
529 def process_image(img, url, index, total_images):
530 #Check that an image is displayed and is not inside undesired HTML elements
def is_valid_image(img, parent, parent_classes):
    """Decide whether an ``<img>`` tag is worth keeping.

    Args:
        img: Image tag; only its ``get(name, default)`` accessor is used.
        parent: Element associated with the image; only ``parent.name`` is
            read.
        parent_classes: Iterable of strings that are searched for the
            unwanted markers (presumably the parent's CSS classes).

    Returns:
        ``True`` when the image has a non-empty ``src``, its inline style
        does not contain ``display:none``, none of ``'button'``, ``'icon'``
        or ``'logo'`` occurs in its ``src``, its ``alt`` or any entry of
        ``parent_classes``, and ``parent.name`` is neither ``'button'`` nor
        ``'input'``. ``False`` otherwise.
    """
    classes_to_check = ['button', 'icon', 'logo']
    tags_to_check = ['button', 'input']
    src = img.get('src', '')
    # Drop whitespace and case so "display: none" / "DISPLAY:NONE" are
    # recognised as hidden too, not just the exact text "display:none".
    style = ''.join((img.get('style') or '').split()).lower()
    haystacks = [src, img.get('alt', ''), *parent_classes]
    looks_decorative = any(
        marker in text for text in haystacks for marker in classes_to_check
    )
    return bool(
        src
        and 'display:none' not in style
        and not looks_decorative
        and parent.name not in tags_to_check
    )
542
543 #Score an image for its usefulness
544 def score_image_for_usefulness(img, base_url, index, images_count):
545 # Function to parse image height/width value and units
def parse_dimension(dimension):
    """Split an HTML height/width attribute value into a number and a unit.

    Args:
        dimension: Raw attribute value such as ``"150"``, ``"150px"`` or
            ``"33.5%"``; ``None`` or ``""`` when the attribute is absent.

    Returns:
        A ``(number, unit)`` tuple. ``number`` is an ``int`` (any fractional
        part is dropped) and ``unit`` is lower-cased, defaulting to ``'px'``
        when no unit is given. ``(None, None)`` is returned when the value
        is missing or does not start with a digit.
    """
    if not dimension:
        return None, None
    # Strip padding, skip an optional fractional part, and allow a space
    # before the unit so "100.5px" or " 100 px " still parse as 100 / 'px'.
    match = re.match(r"(\d+)(?:\.\d+)?\s*(\D*)", str(dimension).strip())
    if not match:
        return None, None
    number = int(match.group(1))
    # Lower-case so the unit compares equal to 'px', '%', 'vh', ...
    unit = match.group(2).strip().lower() or 'px'  # Default unit is 'px'
    return number, unit
554
555 # Fetch image file metadata to extract size and extension
def fetch_image_file_size(img, base_url):
    """Return the image's ``Content-Length`` as reported by a HEAD request.

    Args:
        img: Image tag; its ``src`` is read via ``img.get('src')``.
        base_url: URL that a relative ``src`` is resolved against.

    Returns:
        The raw ``Content-Length`` header value when the server answers 200
        and sends the header; ``None`` otherwise, including when the request
        itself fails.
    """
    # If src is a relative path construct the full URL; if not it may be a
    # CDN URL, which urljoin leaves as-is.
    img_url = urljoin(base_url, img.get('src'))
    try:
        # A timeout keeps one unresponsive host from stalling the caller.
        response = requests.head(img_url, timeout=10)
    except requests.exceptions.RequestException:
        # Best effort: an unsupported scheme (InvalidSchema), a timeout or a
        # connection error all just mean "size unknown".
        return None
    # NOTE: there is deliberately no `finally: return` here. A return inside
    # `finally` replaces every earlier return value with None.
    if response.status_code == 200:
        return response.headers.get('Content-Length')
    print(f"Failed to retrieve file size for {img_url}")
    return None
570
571 image_height = img.get('height')
572 height_value, height_unit = parse_dimension(image_height)
573 image_width = img.get('width')
574 width_value, width_unit = parse_dimension(image_width)
575 image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
576 image_format = os.path.splitext(img.get('src',''))[1].lower()
577 # Remove . from format
578 image_format = image_format.strip('.')
579 score = 0
580 if height_value:
581 if height_unit == 'px' and height_value > 150:
582 score += 1
583 if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
584 score += 1
585 if width_value:
586 if width_unit == 'px' and width_value > 150:

Callers 1

Calls 3

is_valid_imageFunction · 0.85

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…