(html, soup = None)
| 734 | } |
| 735 | |
def extract_metadata(html, soup=None):
    """Collect page-level metadata from an HTML document.

    Args:
        html: Raw HTML text. When falsy, an empty dict is returned
            immediately, even if ``soup`` was supplied.
        soup: Optional pre-parsed document exposing ``find`` / ``find_all``.
            When falsy, ``html`` is parsed with ``BeautifulSoup`` using the
            ``'html.parser'`` backend.

    Returns:
        A dict that always contains the keys ``'title'``, ``'description'``,
        ``'keywords'`` and ``'author'`` (each ``None`` when unavailable),
        plus one entry per ``<meta>`` tag whose ``property`` starts with
        ``'og:'`` or whose ``name`` starts with ``'twitter:'``, keyed by that
        attribute value. A matching tag with no ``content`` attribute yields
        ``None`` instead of raising ``KeyError``.
    """
    metadata = {}

    if not html:
        return metadata

    # Parse HTML content only when the caller did not hand us a parsed tree.
    if not soup:
        soup = BeautifulSoup(html, 'html.parser')

    # Title
    # NOTE(review): `.string` is presumably None when <title> holds nested
    # markup rather than a single text node -- confirm against bs4 docs.
    title_tag = soup.find('title')
    metadata['title'] = title_tag.string if title_tag is not None else None

    # Standard named <meta> tags. `.get` is used instead of indexing so that
    # a tag lacking a `content` attribute is reported as None rather than
    # aborting the whole extraction with a KeyError.
    for field in ('description', 'keywords', 'author'):
        meta_tag = soup.find('meta', attrs={'name': field})
        metadata[field] = meta_tag.get('content') if meta_tag is not None else None

    # Open Graph metadata (<meta property="og:...">)
    og_tags = soup.find_all(
        'meta',
        attrs={'property': lambda value: value and value.startswith('og:')},
    )
    for tag in og_tags:
        metadata[tag['property']] = tag.get('content')

    # Twitter Card metadata (<meta name="twitter:...">)
    twitter_tags = soup.find_all(
        'meta',
        attrs={'name': lambda value: value and value.startswith('twitter:')},
    )
    for tag in twitter_tags:
        metadata[tag['name']] = tag.get('content')

    return metadata
| 775 | |
| 776 | def extract_xml_tags(string): |
| 777 | tags = re.findall(r'<(\w+)>', string) |
no outgoing calls
no test coverage detected
searching dependent graphs…