MCPcopy Index your code
hub / github.com/ScrapeGraphAI/Scrapegraph-ai / cleanup_html

Function cleanup_html

scrapegraphai/utils/cleanup_html.py:48–101  ·  view source on GitHub ↗

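Parses HTML content, removes style tags, minifies the body, and extracts the title, link URLs, image URLs, and script content. Args: html_content (str): The HTML content to be processed. base_url (str): Base URL used to resolve relative link and image URLs. Returns: tuple: (title, minified body, link URLs, image URLs, script content). Raises: ValueError: If no body content is found.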

(html_content: str, base_url: str)

Source from the content-addressed store, hash-verified

46
47
def cleanup_html(html_content: str, base_url: str) -> tuple:
    """
    Parse an HTML document and extract its title, minified body, link URLs,
    image URLs and script-derived content.

    ``<style>`` tags are removed from the parsed tree before the body is
    minified. Link and image URLs are resolved against ``base_url``.

    Args:
        html_content (str): The HTML content to be processed.
        base_url (str): Base URL used to resolve relative ``href`` and
            ``src`` attribute values.

    Returns:
        tuple: ``(title, minimized_body, link_urls, image_urls, script_content)``
            where ``title`` is the text of the first ``<title>`` tag (empty
            string if there is none), ``minimized_body`` is the result of
            ``minify`` applied to the ``<body>`` element, ``link_urls`` and
            ``image_urls`` are lists of URLs, and ``script_content`` is
            whatever ``extract_from_script_tags`` returns for the document.

    Raises:
        ValueError: If no ``<body>`` element is found in the document.

    Example:
        >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
        >>> title, body, links, images, scripts = cleanup_html(html_content, "https://example.com")
        >>> title
        'Example'
    """

    soup = BeautifulSoup(html_content, "html.parser")

    title_tag = soup.find("title")
    title = title_tag.get_text() if title_tag else ""

    # Script content is collected before any tags are stripped from the tree.
    script_content = extract_from_script_tags(soup)

    for tag in soup.find_all("style"):
        tag.extract()

    link_urls = [
        urljoin(base_url, link["href"]) for link in soup.find_all("a", href=True)
    ]

    image_urls = []
    for image in soup.find_all("img"):
        if "src" not in image.attrs:
            continue
        src = image["src"]
        # Only sources that already start with an http(s) scheme are kept
        # verbatim. A plain substring test for "http" would wrongly skip
        # resolution of relative paths such as "/img/http-logo.png".
        if src.startswith(("http://", "https://")):
            image_urls.append(src)
        else:
            image_urls.append(urljoin(base_url, src))

    body_content = soup.find("body")
    if not body_content:
        raise ValueError(
            "No HTML body content found, please try setting the 'headless'\n"
            f"flag to False in the graph configuration. HTML content: {html_content}"
        )

    minimized_body = minify(str(body_content))
    return title, minimized_body, link_urls, image_urls, script_content
102
103
104def minify_html(html):

Callers 3

handle_web_sourceMethod · 0.90

Calls 1

extract_from_script_tagsFunction · 0.85

Tested by 2