Scrape a URL and turn it into readable markdown if it's HTML. If it's plain text or non-HTML, return it as-is. `url` - the URL to scrape.
(self, url)
| 96 | self.verify_ssl = verify_ssl |
| 97 | |
def scrape(self, url):
    """
    Scrape a URL and turn it into readable markdown if it's HTML.
    If it's plain text or non-HTML, return it as-is.

    `url` - the URL to scrape.

    Returns the retrieved content, passed through `html_to_markdown` when
    it is judged to be HTML. Returns None when nothing could be retrieved;
    in that case the failure is reported through `print_error`.
    """

    if self.playwright_available:
        content, mime_type = self.scrape_with_playwright(url)
    else:
        content, mime_type = self.scrape_with_httpx(url)

    if not content:
        self.print_error(f"Failed to retrieve content from {url}")
        return None

    # Media types are case-insensitive, and an absent Content-Type may reach
    # us as None or as an empty string. Normalize so both "no type" spellings
    # are treated alike and "Text/HTML" matches "text/html".
    mime_type = (mime_type or "").strip().lower()

    # Trust an explicit MIME type; only sniff the content when none was given.
    if mime_type:
        is_html = mime_type.startswith("text/html")
    else:
        is_html = self.looks_like_html(content)

    if is_html:
        self.try_pandoc()
        content = self.html_to_markdown(content)

    return content
| 123 | |
| 124 | def looks_like_html(self, content): |
| 125 | """ |