| 77 | |
| 78 | |
| 79 | class Scraper: |
| 80 | pandoc_available = None |
| 81 | playwright_available = None |
| 82 | playwright_instructions_shown = False |
| 83 | |
| 84 | # Public API... |
| 85 | def __init__(self, print_error=None, playwright_available=None, verify_ssl=True): |
| 86 | """ |
| 87 | `print_error` - a function to call to print error/debug info. |
| 88 | `verify_ssl` - if False, disable SSL certificate verification when scraping. |
| 89 | """ |
| 90 | if print_error: |
| 91 | self.print_error = print_error |
| 92 | else: |
| 93 | self.print_error = print |
| 94 | |
| 95 | self.playwright_available = playwright_available |
| 96 | self.verify_ssl = verify_ssl |
| 97 | |
| 98 | def scrape(self, url): |
| 99 | """ |
| 100 | Scrape a url and turn it into readable markdown if it's HTML. |
| 101 | If it's plain text or non-HTML, return it as-is. |
| 102 | |
| 103 | `url` - the URL to scrape. |
| 104 | """ |
| 105 | |
| 106 | if self.playwright_available: |
| 107 | content, mime_type = self.scrape_with_playwright(url) |
| 108 | else: |
| 109 | content, mime_type = self.scrape_with_httpx(url) |
| 110 | |
| 111 | if not content: |
| 112 | self.print_error(f"Failed to retrieve content from {url}") |
| 113 | return None |
| 114 | |
| 115 | # Check if the content is HTML based on MIME type or content |
| 116 | if (mime_type and mime_type.startswith("text/html")) or ( |
| 117 | mime_type is None and self.looks_like_html(content) |
| 118 | ): |
| 119 | self.try_pandoc() |
| 120 | content = self.html_to_markdown(content) |
| 121 | |
| 122 | return content |
| 123 | |
| 124 | def looks_like_html(self, content): |
| 125 | """ |
| 126 | Check if the content looks like HTML. |
| 127 | """ |
| 128 | if isinstance(content, str): |
| 129 | # Check for common HTML tags |
| 130 | html_patterns = [ |
| 131 | r"<!DOCTYPE\s+html", |
| 132 | r"<html", |
| 133 | r"<head", |
| 134 | r"<body", |
| 135 | r"<div", |
| 136 | r"<p>", |
no outgoing calls