(img, base_url, index, images_count)
| 542 | |
| 543 | # Score an image for its usefulness |
def score_image_for_usefulness(img, base_url, index, images_count):
    """Score an image tag by counting simple "looks like real content" signals.

    One point is added for each of the following that holds:

    * declared height is > 150px, or > 30 in a relative unit;
    * declared width is > 150px, or > 30 in a relative unit;
    * file size is > 10000 bytes (currently never true, see below);
    * the ``alt`` attribute is present and non-empty;
    * the ``src`` extension is ``jpg``, ``png`` or ``webp``;
    * the image sits in the first half of the page's images.

    Args:
        img: Tag-like object exposing ``get(name, default=None)`` for its
            attributes (``height``, ``width``, ``src``, ``alt`` are read).
        base_url: Page URL; only used by the file-size lookup helper.
        index: Position of this image among the page's images.
        images_count: Total number of images on the page.

    Returns:
        int: The number of heuristics that matched.
    """

    def parse_dimension(dimension):
        """Split e.g. ``"200px"`` into ``(200, "px")``; ``(None, None)`` if unparsable."""
        if dimension:
            match = re.match(r"(\d+)(\D*)", dimension)
            if match:
                number = int(match.group(1))
                # Normalise so "50 %" and "200PX" are recognised; a bare
                # number is treated as pixels.
                unit = match.group(2).strip().lower() or 'px'
                return number, unit
        return None, None

    def fetch_image_file_size(img, base_url):
        """Return the ``Content-Length`` header of the image via HEAD, or ``None``."""
        # If src is a relative path this builds the full URL; an absolute
        # (e.g. CDN) URL is left as is.
        img_url = urljoin(base_url, img.get('src'))
        try:
            # A timeout keeps one unresponsive host from stalling scoring.
            response = requests.head(img_url, timeout=10)
        except requests.RequestException:
            # Best effort: an unknown size must never break scoring.
            return None
        if response.status_code == 200:
            return response.headers.get('Content-Length', None)
        print(f"Failed to retrieve file size for {img_url}")
        return None

    height_value, height_unit = parse_dimension(img.get('height'))
    width_value, width_unit = parse_dimension(img.get('width'))

    # The size lookup is disabled because it costs one HEAD request per image.
    # To enable it: image_size = int(fetch_image_file_size(img, base_url) or 0)
    image_size = 0

    # Drop any query string / fragment so "a.jpg?w=200" still yields "jpg".
    src_path = (img.get('src') or '').split('?', 1)[0].split('#', 1)[0]
    image_format = os.path.splitext(src_path)[1].lower().strip('.')

    score = 0
    if height_value:
        if height_unit == 'px' and height_value > 150:
            score += 1
        if height_unit in ('%', 'vh', 'vmin', 'vmax') and height_value > 30:
            score += 1
    if width_value:
        if width_unit == 'px' and width_value > 150:
            score += 1
        # 'vh' is kept for backward compatibility; 'vw' is the viewport unit
        # that actually describes width.
        if width_unit in ('%', 'vw', 'vh', 'vmin', 'vmax') and width_value > 30:
            score += 1
    if image_size > 10000:
        score += 1
    # A missing alt attribute (None) must not count as descriptive text.
    if img.get('alt'):
        score += 1
    if image_format in ('jpg', 'png', 'webp'):
        score += 1
    # Guard against an empty image list to avoid ZeroDivisionError.
    if images_count > 0 and index / images_count < 0.5:
        score += 1
    return score
| 599 | |
| 600 | if not is_valid_image(img, img.parent, img.parent.get('class', [])): |
| 601 | return None |
no test coverage detected
searching dependent graphs…