Download full site using wget.
(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
| 45 | |
| 46 | @enforce_types |
| 47 | def save_wget(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: |
| 48 | """download full site using wget""" |
| 49 | |
| 50 | out_dir = out_dir or link.link_dir |
| 51 | if SAVE_WARC: |
| 52 | warc_dir = out_dir / "warc" |
| 53 | warc_dir.mkdir(exist_ok=True) |
| 54 | warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp())) |
| 55 | |
| 56 | # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html |
| 57 | output: ArchiveOutput = None |
| 58 | cmd = [ |
| 59 | WGET_BINARY, |
| 60 | # '--server-response', # print headers for better error parsing |
| 61 | *WGET_ARGS, |
| 62 | '--timeout={}'.format(timeout), |
| 63 | *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []), |
| 64 | *(['--warc-file={}'.format(str(warc_path))] if SAVE_WARC else []), |
| 65 | *(['--page-requisites'] if SAVE_WGET_REQUISITES else []), |
| 66 | *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []), |
| 67 | *(['--load-cookies', str(COOKIES_FILE)] if COOKIES_FILE else []), |
| 68 | *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []), |
| 69 | *([] if SAVE_WARC else ['--timestamping']), |
| 70 | *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']), |
| 71 | link.url, |
| 72 | ] |
| 73 | |
| 74 | status = 'succeeded' |
| 75 | timer = TimedProgress(timeout, prefix=' ') |
| 76 | try: |
| 77 | result = run(cmd, cwd=str(out_dir), timeout=timeout) |
| 78 | output = wget_output_path(link) |
| 79 | |
| 80 | # parse out number of files downloaded from last line of stderr: |
| 81 | # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)" |
| 82 | output_tail = [ |
| 83 | line.strip() |
| 84 | for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] |
| 85 | if line.strip() |
| 86 | ] |
| 87 | files_downloaded = ( |
| 88 | int(output_tail[-1].strip().split(' ', 2)[1] or 0) |
| 89 | if 'Downloaded:' in output_tail[-1] |
| 90 | else 0 |
| 91 | ) |
| 92 | hints = ( |
| 93 | 'Got wget response code: {}.'.format(result.returncode), |
| 94 | *output_tail, |
| 95 | ) |
| 96 | |
| 97 | # Check for common failure cases |
| 98 | if (result.returncode > 0 and files_downloaded < 1) or output is None: |
| 99 | if b'403: Forbidden' in result.stderr: |
| 100 | raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints) |
| 101 | if b'404: Not Found' in result.stderr: |
| 102 | raise ArchiveError('404 Not Found', hints) |
| 103 | if b'ERROR 500: Internal Server Error' in result.stderr: |
| 104 | raise ArchiveError('500 Internal Server Error', hints) |
Nothing calls this function directly.
No test coverage detected.