Download a reader-friendly version of the page using @mozilla/readability.
(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
| 37 | |
| 38 | @enforce_types |
| 39 | def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult: |
| 40 | """download reader friendly version using @mozilla/readability""" |
| 41 | |
| 42 | out_dir = Path(out_dir or link.link_dir) |
| 43 | output_folder = out_dir.absolute() / "readability" |
| 44 | output = "readability" |
| 45 | |
| 46 | # Readability Docs: https://github.com/mozilla/readability |
| 47 | |
| 48 | status = 'succeeded' |
| 49 | # fake command to show the user so they have something to try debugging if get_html fails |
| 50 | cmd = [ |
| 51 | CURL_BINARY, |
| 52 | link.url |
| 53 | ] |
| 54 | readability_content = None |
| 55 | timer = TimedProgress(timeout, prefix=' ') |
| 56 | try: |
| 57 | document = get_html(link, out_dir) |
| 58 | temp_doc = NamedTemporaryFile(delete=False) |
| 59 | temp_doc.write(document.encode("utf-8")) |
| 60 | temp_doc.close() |
| 61 | |
| 62 | if not document or len(document) < 10: |
| 63 | raise ArchiveError('Readability could not find HTML to parse for article text') |
| 64 | |
| 65 | cmd = [ |
| 66 | DEPENDENCIES['READABILITY_BINARY']['path'], |
| 67 | temp_doc.name, |
| 68 | link.url, |
| 69 | ] |
| 70 | result = run(cmd, cwd=out_dir, timeout=timeout) |
| 71 | try: |
| 72 | result_json = json.loads(result.stdout) |
| 73 | assert result_json and 'content' in result_json, 'Readability output is not valid JSON' |
| 74 | except json.JSONDecodeError: |
| 75 | raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr) |
| 76 | |
| 77 | output_folder.mkdir(exist_ok=True) |
| 78 | readability_content = result_json.pop("textContent") |
| 79 | atomic_write(str(output_folder / "content.html"), result_json.pop("content")) |
| 80 | atomic_write(str(output_folder / "content.txt"), readability_content) |
| 81 | atomic_write(str(output_folder / "article.json"), result_json) |
| 82 | |
| 83 | output_tail = [ |
| 84 | line.strip() |
| 85 | for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:] |
| 86 | if line.strip() |
| 87 | ] |
| 88 | hints = ( |
| 89 | 'Got readability response code: {}.'.format(result.returncode), |
| 90 | *output_tail, |
| 91 | ) |
| 92 | |
| 93 | # Check for common failure cases |
| 94 | if (result.returncode > 0): |
| 95 | raise ArchiveError(f'Readability was not able to archive the page (status={result.returncode})', hints) |
| 96 | except (Exception, OSError) as err: |
Nothing calls this function directly.
No test coverage was detected.