Download the given URL and return a binary-mode file object to access the data.
(url: str, cache_dir: str = None, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False, cache: bool = True)
| 383 | |
| 384 | |
| 385 | def open_url(url: str, cache_dir: str = None, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False, cache: bool = True) -> Any: |
| 386 | """Download the given URL and return a binary-mode file object to access the data.""" |
| 387 | assert num_attempts >= 1 |
| 388 | assert not (return_filename and (not cache)) |
| 389 | |
| 390 | # Doesn't look like an URL scheme so interpret it as a local filename. |
| 391 | if not re.match('^[a-z]+://', url): |
| 392 | return url if return_filename else open(url, "rb") |
| 393 | |
| 394 | # Handle file URLs. This code handles unusual file:// patterns that |
| 395 | # arise on Windows: |
| 396 | # |
| 397 | # file:///c:/foo.txt |
| 398 | # |
| 399 | # which would translate to a local '/c:/foo.txt' filename that's |
| 400 | # invalid. Drop the forward slash for such pathnames. |
| 401 | # |
| 402 | # If you touch this code path, you should test it on both Linux and |
| 403 | # Windows. |
| 404 | # |
| 405 | # Some internet resources suggest using urllib.request.url2pathname(), |
| 406 | # but that converts forward slashes to backslashes and this causes |
| 407 | # its own set of problems. |
| 408 | if url.startswith('file://'): |
| 409 | filename = urllib.parse.urlparse(url).path |
| 410 | if re.match(r'^/[a-zA-Z]:', filename): |
| 411 | filename = filename[1:] |
| 412 | return filename if return_filename else open(filename, "rb") |
| 413 | |
| 414 | assert is_url(url) |
| 415 | |
| 416 | # Lookup from cache. |
| 417 | if cache_dir is None: |
| 418 | cache_dir = make_cache_dir_path('downloads') |
| 419 | |
| 420 | url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest() |
| 421 | if cache: |
| 422 | cache_files = glob.glob(os.path.join(cache_dir, url_md5 + "_*")) |
| 423 | if len(cache_files) == 1: |
| 424 | filename = cache_files[0] |
| 425 | return filename if return_filename else open(filename, "rb") |
| 426 | |
| 427 | # Download. |
| 428 | url_name = None |
| 429 | url_data = None |
| 430 | with requests.Session() as session: |
| 431 | if verbose: |
| 432 | print("Downloading %s ..." % url, end="", flush=True) |
| 433 | for attempts_left in reversed(range(num_attempts)): |
| 434 | try: |
| 435 | with session.get(url) as res: |
| 436 | res.raise_for_status() |
| 437 | if len(res.content) == 0: |
| 438 | raise IOError("No data received") |
| 439 | |
| 440 | if len(res.content) < 8192: |
| 441 | content_str = res.content.decode("utf-8") |
| 442 | if "download_warning" in res.headers.get("Set-Cookie", ""): |
nothing calls this directly
no test coverage detected