Extract the iframes from an html page.
| 65 | |
| 66 | |
class IframeParser(HTMLParser):
    """Extract the iframes from an html page.

    Feed an HTML document to the parser with ``feed()``; afterwards
    ``iframes`` holds one ``bytes`` object per ``<iframe>`` start tag that
    carried an embedded document, in the order the tags were encountered.

    The embedded document is looked up in the following attributes, first
    match wins:

    * ``srcdoc``: the HTML itself, encoded to bytes.
    * ``data-html`` (legacy): either percent-encoded or base64-encoded HTML.
    * ``src`` (legacy): whatever follows the last comma is base64-decoded.

    An iframe that has none of these attributes, or only has them without a
    value (e.g. ``<iframe srcdoc>``), is skipped instead of raising.
    """

    def __init__(self):
        super().__init__()
        # Decoded content of each iframe found so far, as bytes.
        self.iframes = []

    def handle_starttag(self, tag, attrs):
        """Collect the embedded HTML of an ``<iframe>`` start tag.

        Parameters
        ----------
        tag : str
            Name of the tag, as passed in by ``HTMLParser``.
        attrs : list of (name, value) tuples
            Attributes of the tag. ``HTMLParser`` passes ``None`` as the
            value of an attribute written without one.

        Raises
        ------
        binascii.Error
            Propagated from ``base64.b64decode`` when a legacy payload is
            not valid base64.
        """
        if tag != "iframe":
            return

        attrs = dict(attrs)
        # ``.get`` maps both "attribute missing" and "attribute without a
        # value" to None, so neither case can blow up further down.
        srcdoc = attrs.get("srcdoc")
        data_html = attrs.get("data-html")
        src = attrs.get("src")

        if srcdoc is not None:
            html_bytes = srcdoc.encode()
        elif data_html is not None:  # legacy
            if "%" in data_html[:20]:
                # newest branca version: data-html is percent-encoded
                html_bytes = unquote(data_html).encode()
            else:
                # legacy branca version: data-html is base64 encoded
                html_bytes = base64.b64decode(data_html)
        elif src is not None:  # legacy
            # The base64 payload is the part after the last comma.
            html_base64 = src.split(",")[-1]
            html_bytes = base64.b64decode(html_base64)
        else:
            # Nothing embedded in this iframe, so there is nothing to extract.
            return

        self.iframes.append(html_bytes)
no outgoing calls