MCPcopy
hub / github.com/ArchiveBox/ArchiveBox / get_html

Function get_html

archivebox/extractors/title.py:62–83  ·  view source on GitHub ↗

Try to read the saved HTML from the DOM dump first, then the singlefile output, then the wget output. If none of them can be read, download the URL again.

(link: Link, path: Path, timeout: int=TIMEOUT)

Source from the content-addressed store, hash-verified

60
@enforce_types
def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
    """
    Return the HTML for a link, preferring output already saved on disk.

    Candidate files are tried in this order: the DOM dump, the singlefile
    output, then the wget output. The first one that can be read as UTF-8
    text is returned as-is (an empty file yields an empty string).
    If none of them can be read, the url is downloaded again.

    link:    its canonical_outputs() supplies the candidate paths and its
             url is used for the fallback download
    path:    directory that the candidate paths are joined onto
    timeout: passed through to download_url() for the fallback download
    """
    canonical = link.canonical_outputs()
    abs_path = path.absolute()

    # prefer chrome-generated DOM dump to singlefile as singlefile output often includes HUGE url(data:image/...base64) strings that crash parsers
    sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]]
    for source in sources:
        try:
            with open(abs_path / source, "r", encoding="utf-8") as f:
                return f.read()
        except (
            FileNotFoundError,    # this output was never saved
            IsADirectoryError,    # candidate path points at a directory, not a file
            NotADirectoryError,   # a parent component of the candidate path is a regular file
            TypeError,            # source cannot be joined onto a path (e.g. it is None)
            UnicodeDecodeError,   # saved file is not valid UTF-8 text
        ):
            # this candidate is unusable, fall through to the next one
            continue

    # nothing usable on disk, fetch the page again
    return download_url(link.url, timeout=timeout)
84
85@enforce_types
86def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:

Callers 3

save_titleFunction · 0.85
save_htmltotextFunction · 0.85
save_readabilityFunction · 0.85

Calls 2

download_urlFunction · 0.85
canonical_outputsMethod · 0.80

Tested by

no test coverage detected