Function extract_metadata

crawl4ai/utils.py:736–774 · view source on GitHub ↗

(html, soup = None)

Source from the content-addressed store, hash-verified

734	}
735
def extract_metadata(html, soup = None):
    """Collect page-level metadata from an HTML document.

    The result always contains the keys ``title``, ``description``,
    ``keywords`` and ``author`` (each ``None`` when the corresponding element
    is absent), plus one entry per ``<meta>`` tag whose ``property`` starts
    with ``og:`` or whose ``name`` starts with ``twitter:``, keyed by that
    attribute's full value.

    Args:
        html: Raw HTML markup. A falsy value short-circuits to an empty dict.
        soup: Optional already-parsed tree for ``html``. When falsy, ``html``
            is parsed here with the ``'html.parser'`` backend.

    Returns:
        dict mapping metadata names to their values. A ``<meta>`` tag that
        has no ``content`` attribute contributes ``None`` rather than raising
        ``KeyError``.
    """
    metadata = {}

    if not html:
        return metadata

    # Parse only when the caller did not hand in a parsed tree.
    if not soup:
        soup = BeautifulSoup(html, 'html.parser')

    # Title
    title_tag = soup.find('title')
    metadata['title'] = title_tag.string if title_tag else None

    # Standard named <meta> tags. ``.get`` is used instead of subscripting so
    # a tag that lacks a ``content`` attribute yields None instead of aborting
    # the whole extraction with KeyError.
    for meta_name in ('description', 'keywords', 'author'):
        meta_tag = soup.find('meta', attrs={'name': meta_name})
        metadata[meta_name] = meta_tag.get('content') if meta_tag else None

    # Open Graph metadata: <meta property="og:..." content="...">
    og_tags = soup.find_all('meta', attrs={'property': lambda value: value and value.startswith('og:')})
    for tag in og_tags:
        metadata[tag['property']] = tag.get('content')

    # Twitter Card metadata: <meta name="twitter:..." content="...">
    twitter_tags = soup.find_all('meta', attrs={'name': lambda value: value and value.startswith('twitter:')})
    for tag in twitter_tags:
        metadata[tag['name']] = tag.get('content')

    return metadata
775
776	def extract_xml_tags(string):
777	tags = re.findall(r'<(\w+)>', string)

get_content_of_websiteFunction · 0.85

get_content_of_website_optimizedFunction · 0.85

run_oldMethod · 0.85

process_htmlMethod · 0.85

_get_content_of_website_optimizedMethod · 0.85

no outgoing calls

no test coverage detected

searching dependent graphs…