MCPcopy
hub / github.com/ArchiveBox/ArchiveBox / save_readability

Function save_readability

archivebox/extractors/readability.py:39–113  ·  view source on GitHub ↗

Download a reader-friendly version using @mozilla/readability.

(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)

Source from the content-addressed store, hash-verified

37
38@enforce_types
39def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
40 """download reader friendly version using @mozilla/readability"""
41
42 out_dir = Path(out_dir or link.link_dir)
43 output_folder = out_dir.absolute() / "readability"
44 output = "readability"
45
46 # Readability Docs: https://github.com/mozilla/readability
47
48 status = 'succeeded'
49 # fake command to show the user so they have something to try debugging if get_html fails
50 cmd = [
51 CURL_BINARY,
52 link.url
53 ]
54 readability_content = None
55 timer = TimedProgress(timeout, prefix=' ')
56 try:
57 document = get_html(link, out_dir)
58 temp_doc = NamedTemporaryFile(delete=False)
59 temp_doc.write(document.encode("utf-8"))
60 temp_doc.close()
61
62 if not document or len(document) < 10:
63 raise ArchiveError('Readability could not find HTML to parse for article text')
64
65 cmd = [
66 DEPENDENCIES['READABILITY_BINARY']['path'],
67 temp_doc.name,
68 link.url,
69 ]
70 result = run(cmd, cwd=out_dir, timeout=timeout)
71 try:
72 result_json = json.loads(result.stdout)
73 assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
74 except json.JSONDecodeError:
75 raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr)
76
77 output_folder.mkdir(exist_ok=True)
78 readability_content = result_json.pop("textContent")
79 atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
80 atomic_write(str(output_folder / "content.txt"), readability_content)
81 atomic_write(str(output_folder / "article.json"), result_json)
82
83 output_tail = [
84 line.strip()
85 for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
86 if line.strip()
87 ]
88 hints = (
89 'Got readability response code: {}.'.format(result.returncode),
90 *output_tail,
91 )
92
93 # Check for common failure cases
94 if (result.returncode > 0):
95 raise ArchiveError(f'Readability was not able to archive the page (status={result.returncode})', hints)
96 except (Exception, OSError) as err:

Callers

nothing calls this directly

Calls 7

endMethod · 0.95
TimedProgressClass · 0.85
get_htmlFunction · 0.85
ArchiveErrorClass · 0.85
atomic_writeFunction · 0.85
runFunction · 0.50
ArchiveResultClass · 0.50

Tested by

no test coverage detected