(element: element.PageElement)
| 611 | } |
| 612 | |
| 613 | def process_element(element: element.PageElement) -> bool: |
| 614 | try: |
| 615 | if isinstance(element, NavigableString): |
| 616 | if isinstance(element, Comment): |
| 617 | element.extract() |
| 618 | return False |
| 619 | |
| 620 | if element.name in ['script', 'style', 'link', 'meta', 'noscript']: |
| 621 | element.decompose() |
| 622 | return False |
| 623 | |
| 624 | keep_element = False |
| 625 | |
| 626 | if element.name == 'a' and element.get('href'): |
| 627 | href = element['href'] |
| 628 | url_base = url.split('/')[2] |
| 629 | link_data = {'href': href, 'text': element.get_text()} |
| 630 | if href.startswith('http') and url_base not in href: |
| 631 | links['external'].append(link_data) |
| 632 | else: |
| 633 | links['internal'].append(link_data) |
| 634 | keep_element = True |
| 635 | |
| 636 | elif element.name == 'img': |
| 637 | return True # Always keep image elements |
| 638 | |
| 639 | elif element.name in ['video', 'audio']: |
| 640 | media[f"{element.name}s"].append({ |
| 641 | 'src': element.get('src'), |
| 642 | 'alt': element.get('alt'), |
| 643 | 'type': element.name, |
| 644 | 'description': find_closest_parent_with_useful_text(element) |
| 645 | }) |
| 646 | source_tags = element.find_all('source') |
| 647 | for source_tag in source_tags: |
| 648 | media[f"{element.name}s"].append({ |
| 649 | 'src': source_tag.get('src'), |
| 650 | 'alt': element.get('alt'), |
| 651 | 'type': element.name, |
| 652 | 'description': find_closest_parent_with_useful_text(element) |
| 653 | }) |
| 654 | return True # Always keep video and audio elements |
| 655 | |
| 656 | if element.name != 'pre': |
| 657 | if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']: |
| 658 | if kwargs.get('only_text', False): |
| 659 | element.replace_with(element.get_text()) |
| 660 | else: |
| 661 | element.unwrap() |
| 662 | elif element.name != 'img': |
| 663 | element.attrs = {} |
| 664 | |
| 665 | # Process children |
| 666 | for child in list(element.children): |
| 667 | if isinstance(child, NavigableString) and not isinstance(child, Comment): |
| 668 | if len(child.strip()) > 0: |
| 669 | keep_element = True |
| 670 | else: |
no test coverage detected
searching dependent graphs…