Calculate the path to the .html file that wget saved, since wget may adjust some paths so that they differ from the base_url path. See the wget docs on --adjust-extension (-E).
(link: Link)
| 127 | |
| 128 | @enforce_types |
| 129 | def wget_output_path(link: Link) -> Optional[str]: |
| 130 | """calculate the path to the wgetted .html file, since wget may |
| 131 | adjust some paths to be different than the base_url path. |
| 132 | |
| 133 | See docs on wget --adjust-extension (-E) |
| 134 | """ |
| 135 | |
| 136 | # Wget downloads can save in a number of different ways depending on the url: |
| 137 | # https://example.com |
| 138 | # > example.com/index.html |
| 139 | # https://example.com?v=zzVa_tX1OiI |
| 140 | # > example.com/index.html?v=zzVa_tX1OiI.html |
| 141 | # https://www.example.com/?v=zzVa_tX1OiI |
| 142 | # > example.com/index.html?v=zzVa_tX1OiI.html |
| 143 | |
| 144 | # https://example.com/abc |
| 145 | # > example.com/abc.html |
| 146 | # https://example.com/abc/ |
| 147 | # > example.com/abc/index.html |
| 148 | # https://example.com/abc?v=zzVa_tX1OiI.html |
| 149 | # > example.com/abc?v=zzVa_tX1OiI.html |
| 150 | # https://example.com/abc/?v=zzVa_tX1OiI.html |
| 151 | # > example.com/abc/index.html?v=zzVa_tX1OiI.html |
| 152 | |
| 153 | # https://example.com/abc/test.html |
| 154 | # > example.com/abc/test.html |
| 155 | # https://example.com/abc/test?v=zzVa_tX1OiI |
| 156 | # > example.com/abc/test?v=zzVa_tX1OiI.html |
| 157 | # https://example.com/abc/test/?v=zzVa_tX1OiI |
| 158 | # > example.com/abc/test/index.html?v=zzVa_tX1OiI.html |
| 159 | |
| 160 | # There's also lots of complexity around how the urlencoding and renaming |
| 161 | # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc |
| 162 | |
| 163 | # Since the wget algorithm for -E (appending .html) is incredibly complex |
| 164 | # and there's no way to get the computed output path from wget |
| 165 | # in order to avoid having to reverse-engineer how they calculate it, |
| 166 | # we just look in the output folder and read the filename wget used from the filesystem |
| 167 | full_path = without_fragment(without_query(path(link.url))).strip('/') |
| 168 | search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path) |
| 169 | for _ in range(4): |
| 170 | if search_dir.exists(): |
| 171 | if search_dir.is_dir(): |
| 172 | html_files = [ |
| 173 | f for f in search_dir.iterdir() |
| 174 | if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M) |
| 175 | ] |
| 176 | if html_files: |
| 177 | return str(html_files[0].relative_to(link.link_dir)) |
| 178 | |
| 179 | # sometimes wget'd URLs have no ext and return non-html |
| 180 | # e.g. /some/example/rss/all -> some RSS XML content) |
| 181 | # /some/other/url.o4g -> some binary unrecognized ext) |
| 182 | # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all |
| 183 | last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1]) |
| 184 | for file_present in search_dir.iterdir(): |
| 185 | if file_present == last_part_of_url: |
| 186 | return str((search_dir / file_present).relative_to(link.link_dir)) |
no outgoing calls
no test coverage detected