(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs)
| 490 | raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e |
| 491 | |
| 492 | def get_content_of_website_optimized(url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]: |
| 493 | if not html: |
| 494 | return None |
| 495 | |
| 496 | soup = BeautifulSoup(html, 'html.parser') |
| 497 | body = soup.body |
| 498 | |
| 499 | image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) |
| 500 | |
| 501 | for tag in kwargs.get('excluded_tags', []) or []: |
| 502 | for el in body.select(tag): |
| 503 | el.decompose() |
| 504 | |
| 505 | if css_selector: |
| 506 | selected_elements = body.select(css_selector) |
| 507 | if not selected_elements: |
| 508 | raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") |
| 509 | body = soup.new_tag('div') |
| 510 | for el in selected_elements: |
| 511 | body.append(el) |
| 512 | |
| 513 | links = {'internal': [], 'external': []} |
| 514 | media = {'images': [], 'videos': [], 'audios': []} |
| 515 | |
| 516 | # Extract meaningful text for media files from closest parent |
| 517 | def find_closest_parent_with_useful_text(tag): |
| 518 | current_tag = tag |
| 519 | while current_tag: |
| 520 | current_tag = current_tag.parent |
| 521 | # Get the text content from the parent tag |
| 522 | if current_tag: |
| 523 | text_content = current_tag.get_text(separator=' ',strip=True) |
| 524 | # Check if the text content has at least image_description_min_word_threshold words |
| 525 | if len(text_content.split()) >= image_description_min_word_threshold: |
| 526 | return text_content |
| 527 | return None |
| 528 | |
| 529 | def process_image(img, url, index, total_images): |
| 530 | # Check that an image is displayed, has a src, and is not a button/icon/logo or inside a button/input element |
| 531 | def is_valid_image(img, parent, parent_classes): |
| 532 | style = img.get('style', '') |
| 533 | src = img.get('src', '') |
| 534 | classes_to_check = ['button', 'icon', 'logo'] |
| 535 | tags_to_check = ['button', 'input'] |
| 536 | return all([ |
| 537 | 'display:none' not in style, |
| 538 | src, |
| 539 | not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check), |
| 540 | parent.name not in tags_to_check |
| 541 | ]) |
| 542 | |
| 543 | # Score an image for its usefulness |
| 544 | def score_image_for_usefulness(img, base_url, index, images_count): |
| 545 | # Function to parse image height/width value and units |
| 546 | def parse_dimension(dimension): |
| 547 | if dimension: |
| 548 | match = re.match(r"(\d+)(\D*)", dimension) |
| 549 | if match: |
no test coverage detected
searching dependent graphs…