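Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content. Args: html_content (str): The HTML content to be processed. base_url (str): The base URL against which link and image URLs are resolved. Returns: tuple: The parsed title, the minified body content, the list of link URLs, the list of image URLs, and the content extracted from script tags. Raises: ValueError: If no body content is found.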
(html_content: str, base_url: str)
| 46 | |
| 47 | |
def cleanup_html(html_content: str, base_url: str) -> tuple:
    """
    Parses an HTML document and collects its title, minified body,
    link URLs, image URLs and script-derived content.

    All ``<style>`` tags are removed from the parsed tree before the links,
    images and body are read.

    Args:
        html_content (str): The HTML content to be processed.
        base_url (str): Base URL that ``href`` and ``src`` values are
            resolved against with ``urljoin``.

    Returns:
        tuple: A 5-tuple ``(title, minimized_body, link_urls, image_urls,
        script_content)`` where

        - ``title`` is the text of the first ``<title>`` tag, or ``""`` when
          the document has none;
        - ``minimized_body`` is the result of passing the ``<body>`` markup
          through ``minify``;
        - ``link_urls`` is a list with one resolved URL per ``<a>`` tag that
          carries an ``href`` attribute;
        - ``image_urls`` is a list with one resolved URL per ``<img>`` tag
          that carries a ``src`` attribute;
        - ``script_content`` is whatever ``extract_from_script_tags``
          returns for the parsed document.

    Raises:
        ValueError: If the document contains no ``<body>`` tag.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    title_tag = soup.find("title")
    title = title_tag.get_text() if title_tag else ""

    # Kept ahead of the <style> removal so the helper sees the same tree
    # it always has.
    script_content = extract_from_script_tags(soup)

    for tag in soup.find_all("style"):
        tag.extract()

    link_urls = [
        urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)
    ]

    # urljoin returns an already-absolute URL as-is, so every src can go
    # through it. The previous '"http" not in src' substring test skipped
    # resolution for relative paths that merely contain "http"
    # (e.g. "/img/http-logo.png").
    image_urls = [
        urljoin(base_url, image["src"]) for image in soup.find_all("img", src=True)
    ]

    body_content = soup.find("body")
    if body_content is None:
        raise ValueError(
            f"""No HTML body content found, please try setting the 'headless'
            flag to False in the graph configuration. HTML content: {html_content}"""
        )

    minimized_body = minify(str(body_content))
    return title, minimized_body, link_urls, image_urls, script_content
| 102 | |
| 103 | |
| 104 | def minify_html(html): |