MCP · copy
hub / github.com/unclecode/crawl4ai / process_element

Function process_element

crawl4ai/utils.py:613–686  ·  view source on GitHub ↗
(element: element.PageElement)

Source from the content-addressed store, hash-verified

611 }
612
613 def process_element(element: element.PageElement) -> bool:
614 try:
615 if isinstance(element, NavigableString):
616 if isinstance(element, Comment):
617 element.extract()
618 return False
619
620 if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
621 element.decompose()
622 return False
623
624 keep_element = False
625
626 if element.name == 'a' and element.get('href'):
627 href = element['href']
628 url_base = url.split('/')[2]
629 link_data = {'href': href, 'text': element.get_text()}
630 if href.startswith('http') and url_base not in href:
631 links['external'].append(link_data)
632 else:
633 links['internal'].append(link_data)
634 keep_element = True
635
636 elif element.name == 'img':
637 return True # Always keep image elements
638
639 elif element.name in ['video', 'audio']:
640 media[f"{element.name}s"].append({
641 'src': element.get('src'),
642 'alt': element.get('alt'),
643 'type': element.name,
644 'description': find_closest_parent_with_useful_text(element)
645 })
646 source_tags = element.find_all('source')
647 for source_tag in source_tags:
648 media[f"{element.name}s"].append({
649 'src': source_tag.get('src'),
650 'alt': element.get('alt'),
651 'type': element.name,
652 'description': find_closest_parent_with_useful_text(element)
653 })
654 return True # Always keep video and audio elements
655
656 if element.name != 'pre':
657 if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
658 if kwargs.get('only_text', False):
659 element.replace_with(element.get_text())
660 else:
661 element.unwrap()
662 elif element.name != 'img':
663 element.attrs = {}
664
665 # Process children
666 for child in list(element.children):
667 if isinstance(child, NavigableString) and not isinstance(child, Comment):
668 if len(child.strip()) > 0:
669 keep_element = True
670 else:

Calls 2

extractMethod · 0.45

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…