MCPcopy
hub / github.com/unclecode/crawl4ai / extract_metadata

Function extract_metadata

crawl4ai/utils.py:736–774  ·  view source on GitHub ↗
(html, soup = None)

Source from the content-addressed store, hash-verified

734 }
735
def extract_metadata(html, soup=None):
    """Collect page-level metadata from an HTML document.

    Args:
        html: Raw HTML markup. A falsy value (``None`` or ``""``) returns an
            empty dict without any parsing.
        soup: Optional already-parsed tree for ``html``. When falsy, ``html``
            is parsed here with BeautifulSoup's ``html.parser``.

    Returns:
        dict: Always contains the keys ``'title'``, ``'description'``,
        ``'keywords'`` and ``'author'`` (``None`` when the tag is absent),
        plus one entry per ``<meta property="og:...">`` and
        ``<meta name="twitter:...">`` tag, keyed by the full property/name.
        A meta tag that is present but has no ``content`` attribute maps to
        ``None`` instead of raising ``KeyError``.
    """
    metadata = {}

    if not html:
        return metadata

    # Parse only when the caller did not hand us an existing tree.
    if not soup:
        soup = BeautifulSoup(html, 'html.parser')

    # Title
    title_tag = soup.find('title')
    metadata['title'] = title_tag.string if title_tag else None

    # Standard <meta name="..."> entries. `.get` is used instead of
    # subscripting because real-world pages contain meta tags without a
    # `content` attribute, and one such tag must not abort the extraction.
    for meta_name in ('description', 'keywords', 'author'):
        meta_tag = soup.find('meta', attrs={'name': meta_name})
        metadata[meta_name] = meta_tag.get('content') if meta_tag else None

    # Open Graph metadata; the filter guarantees `property` is present.
    og_tags = soup.find_all(
        'meta',
        attrs={'property': lambda value: value and value.startswith('og:')},
    )
    for tag in og_tags:
        metadata[tag['property']] = tag.get('content')

    # Twitter Card metadata; the filter guarantees `name` is present.
    twitter_tags = soup.find_all(
        'meta',
        attrs={'name': lambda value: value and value.startswith('twitter:')},
    )
    for tag in twitter_tags:
        metadata[tag['name']] = tag.get('content')

    return metadata
775
776def extract_xml_tags(string):
777 tags = re.findall(r'<(\w+)>', string)

Callers 5

get_content_of_websiteFunction · 0.85
run_oldMethod · 0.85
process_htmlMethod · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…