Try to read the saved DOM dump first, then the singlefile output, then the wget output. If none of them is found, download the URL again.
(link: Link, path: Path, timeout: int=TIMEOUT)
| 60 | |
@enforce_types
def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
    """
    Return the HTML for a link, preferring copies already saved on disk.

    The link's canonical outputs are tried in this order: DOM dump,
    singlefile, then wget. The first one that can be opened and decoded as
    UTF-8 text is returned. If none of them can be read, the url is
    downloaded again.

    Args:
        link: the link whose canonical outputs are searched.
        path: base directory that each canonical output path is joined onto.
        timeout: forwarded to download_url when falling back to the network.

    Returns:
        The text of the first readable source, or otherwise the result of
        download_url(link.url, timeout=timeout).
    """
    canonical = link.canonical_outputs()
    abs_path = path.absolute()

    # prefer chrome-generated DOM dump to singlefile as singlefile output often includes HUGE url(data:image/...base64) strings that crash parsers
    sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]]
    for source in sources:
        try:
            with open(abs_path / source, "r", encoding="utf-8") as f:
                return f.read()
        except (OSError, TypeError, UnicodeDecodeError):
            # OSError: the file is missing, is a directory, or is unreadable.
            # TypeError: the path could not be joined (e.g. source is None).
            # UnicodeDecodeError: the saved output is not valid UTF-8 text.
            # In every case this is a best-effort lookup, so try the next source.
            continue

    # No usable copy on disk: fetch the page again.
    return download_url(link.url, timeout=timeout)
| 84 | |
| 85 | @enforce_types |
| 86 | def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: |
no test coverage detected