MCPcopy
hub / github.com/ArchiveBox/ArchiveBox / save_singlefile

Function save_singlefile

archivebox/extractors/singlefile.py:39–114  ·  view source on GitHub ↗

download full site using single-file

(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)

Source from the content-addressed store, hash-verified

37
38@enforce_types
39def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
40 """download full site using single-file"""
41
42 out_dir = out_dir or Path(link.link_dir)
43 output = "singlefile.html"
44
45 browser_args = chrome_args(CHROME_TIMEOUT=0)
46
47 # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
48 browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
49 options = [
50 *SINGLEFILE_ARGS,
51 '--browser-executable-path={}'.format(CHROME_BINARY),
52 browser_args,
53 ]
54
55 # Deduplicate options (single-file doesn't like when you use the same option two times)
56 #
57 # NOTE: Options names that come first clobber conflicting names that come later
58 # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
59 # specificity, therefore the user sets it with a lot intent, therefore it should take precedence
60 # kind of like the ergonomic principle of lexical scope in programming languages.
61 seen_option_names = []
62 def test_seen(argument):
63 option_name = argument.split("=")[0]
64 if option_name in seen_option_names:
65 return False
66 else:
67 seen_option_names.append(option_name)
68 return True
69 deduped_options = list(filter(test_seen, options))
70
71 cmd = [
72 DEPENDENCIES['SINGLEFILE_BINARY']['path'],
73 *deduped_options,
74 link.url,
75 output,
76 ]
77
78 status = 'succeeded'
79 timer = TimedProgress(timeout, prefix=' ')
80 try:
81 result = run(cmd, cwd=str(out_dir), timeout=timeout)
82
83 # parse out number of files downloaded from last line of stderr:
84 # "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
85 output_tail = [
86 line.strip()
87 for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
88 if line.strip()
89 ]
90 hints = (
91 'Got single-file response code: {}.'.format(result.returncode),
92 *output_tail,
93 )
94
95 # Check for common failure cases
96 if (result.returncode > 0) or not (out_dir / output).is_file():

Callers

nothing calls this directly

Calls 7

endMethod · 0.95
chrome_argsFunction · 0.85
TimedProgressClass · 0.85
ArchiveErrorClass · 0.85
chmod_fileFunction · 0.85
runFunction · 0.50
ArchiveResultClass · 0.50

Tested by

no test coverage detected