MCPcopy
hub / github.com/ArchiveBox/ArchiveBox / save_wget

Function save_wget

archivebox/extractors/wget.py:47–125  ·  view source on GitHub ↗

download full site using wget

(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)

Source from the content-addressed store, hash-verified

45
46@enforce_types
47def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
48 """download full site using wget"""
49
50 out_dir = out_dir or link.link_dir
51 if SAVE_WARC:
52 warc_dir = out_dir / "warc"
53 warc_dir.mkdir(exist_ok=True)
54 warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
55
56 # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
57 output: ArchiveOutput = None
58 cmd = [
59 WGET_BINARY,
60 # '--server-response', # print headers for better error parsing
61 *WGET_ARGS,
62 '--timeout={}'.format(timeout),
63 *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
64 *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []),
65 *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
66 *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
67 *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []),
68 *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
69 *([] if SAVE_WARC else ['--timestamping']),
70 *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
71 link.url,
72 ]
73
74 status = 'succeeded'
75 timer = TimedProgress(timeout, prefix=' ')
76 try:
77 result = run(cmd, cwd=str(out_dir), timeout=timeout)
78 output = wget_output_path(link)
79
80 # parse out number of files downloaded from last line of stderr:
81 # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
82 output_tail = [
83 line.strip()
84 for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
85 if line.strip()
86 ]
87 files_downloaded = (
88 int(output_tail[-1].strip().split(' ', 2)[1] or 0)
89 if 'Downloaded:' in output_tail[-1]
90 else 0
91 )
92 hints = (
93 'Got wget response code: {}.'.format(result.returncode),
94 *output_tail,
95 )
96
97 # Check for common failure cases
98 if (result.returncode > 0 and files_downloaded < 1) or output is None:
99 if b'403: Forbidden' in result.stderr:
100 raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
101 if b'404: Not Found' in result.stderr:
102 raise ArchiveError('404 Not Found', hints)
103 if b'ERROR 500: Internal Server Error' in result.stderr:
104 raise ArchiveError('500 Internal Server Error', hints)

Callers

nothing calls this directly

Calls 7

end (Method) · 0.95
TimedProgress (Class) · 0.85
wget_output_path (Function) · 0.85
ArchiveError (Class) · 0.85
chmod_file (Function) · 0.85
run (Function) · 0.50
ArchiveResult (Class) · 0.50

Tested by

no test coverage detected