MCPcopy
hub / github.com/unclecode/crawl4ai / score_image_for_usefulness

Function score_image_for_usefulness

crawl4ai/utils.py:544–598  ·  view source on GitHub ↗
(img, base_url, index, images_count)

Source from the content-addressed store, hash-verified

542
543 # Score an image for its usefulness
def score_image_for_usefulness(img, base_url, index, images_count):
    """Score an image element by how likely it is to be meaningful content.

    One point is awarded for each heuristic the image satisfies:

    * declared height is large (> 150 px, or > 30 in a relative unit);
    * declared width is large (same thresholds);
    * file size exceeds 10000 bytes (currently never true, see NOTE below);
    * a non-empty ``alt`` text is present;
    * the ``src`` extension is one of jpg, jpeg, png or webp;
    * the image appears in the first half of the images on the page.

    Args:
        img: Tag-like object exposing ``get(name, default)`` for its HTML
            attributes (presumably a BeautifulSoup ``Tag``; a plain dict
            works as well).
        base_url: URL used to resolve a relative ``src``; only consumed by
            the file-size helper.
        index: Zero-based position of this image among the page's images.
        images_count: Total number of images on the page.

    Returns:
        int: The accumulated score; higher means more likely useful.
    """
    # Units measured relative to the viewport/container instead of pixels.
    relative_units = ('%', 'vh', 'vw', 'vmin', 'vmax')
    useful_formats = ('jpg', 'jpeg', 'png', 'webp')

    def parse_dimension(dimension):
        """Split a height/width attribute into ``(number, unit)``.

        Returns ``(None, None)`` when the attribute is absent or does not
        start with a number (for example ``"auto"``).
        """
        if dimension:
            match = re.match(r"\s*(\d+(?:\.\d+)?)\s*(\D*)", str(dimension))
            if match:
                number = float(match.group(1))
                # Default unit is 'px' if not specified.
                unit = match.group(2).strip().lower() or 'px'
                return number, unit
        return None, None

    def fetch_image_file_size(img, base_url):
        """Return the ``Content-Length`` header of the image, or ``None``.

        Best-effort: any request failure yields ``None`` instead of raising.
        """
        # If src is a relative path build the full URL; otherwise it may
        # already be an absolute (e.g. CDN) URL, which urljoin leaves as is.
        img_url = urljoin(base_url, img.get('src'))
        try:
            response = requests.head(img_url, timeout=5)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.headers.get('Content-Length')
        print(f"Failed to retrieve file size for {img_url}")
        return None

    def is_large(value, unit):
        """Tell whether a parsed dimension counts as 'large'."""
        if not value:
            return False
        if unit == 'px':
            return value > 150
        return unit in relative_units and value > 30

    height_value, height_unit = parse_dimension(img.get('height'))
    width_value, width_unit = parse_dimension(img.get('width'))

    # NOTE: the network lookup through fetch_image_file_size is deliberately
    # not performed here, so the size heuristic contributes nothing for now.
    image_size = 0

    # Ignore any query string / fragment so "a.jpg?v=2" is still seen as jpg.
    src_path = (img.get('src', '') or '').split('?', 1)[0].split('#', 1)[0]
    image_format = os.path.splitext(src_path)[1].lower().strip('.')

    score = 0
    if is_large(height_value, height_unit):
        score += 1
    if is_large(width_value, width_unit):
        score += 1
    if image_size > 10000:
        score += 1
    # A missing alt attribute (None) must not count as descriptive text.
    if (img.get('alt') or '').strip():
        score += 1
    if image_format in useful_formats:
        score += 1
    # Guard against an empty page to avoid dividing by zero.
    if images_count and index / images_count < 0.5:
        score += 1
    return score
599
600 if not is_valid_image(img, img.parent, img.parent.get('class', [])):
601 return None

Callers 2

process_imageFunction · 0.85
process_imageMethod · 0.85

Calls 1

parse_dimensionFunction · 0.85

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…