| 70 | # --- Utilities --- |
| 71 | |
def clean_html_content(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip unwanted markup from a parsed document and hand it back.

    Three removal passes run in order: a fixed list of tags (scripts,
    styles, embedded frames/videos, navigation, forms and buttons), then
    HTML comments, then ``input`` tags. The tree is edited in place, so
    the returned object is the same ``soup`` that was passed in.

    Args:
        soup: Parsed document to clean; modified in place.

    Returns:
        The same ``soup`` object with the unwanted nodes removed.
    """
    # Pass 1: drop the listed tags together with everything inside them.
    unwanted_tags = ['script', 'style', 'iframe', 'video', 'nav', 'form', 'button']
    for element in soup.find_all(unwanted_tags):
        element.decompose()

    # Pass 2: detach comment nodes.
    def _is_comment(node) -> bool:
        return isinstance(node, Comment)

    for comment_node in soup.find_all(string=_is_comment):
        comment_node.extract()

    # Pass 3: drop any input tags that are still in the tree.
    for field in soup('input'):
        field.decompose()

    return soup
| 87 | |
| 88 | |
| 89 | def extract_plain_text(soup: BeautifulSoup) -> str: |