Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the path to the cached file. Return: Local path (string). Raises: FileNotFoundError: in case of non-recoverable file (non-existent or no cache on disk); ConnectionError: in case of unreachable url and no cache on disk.
(
url,
cache_dir=None,
force_download=False,
user_agent=None,
use_etag=True,
token=None,
storage_options=None,
download_desc=None,
disable_tqdm=False,
)
| 331 | |
| 332 | |
| 333 | def get_from_cache( |
| 334 | url, |
| 335 | cache_dir=None, |
| 336 | force_download=False, |
| 337 | user_agent=None, |
| 338 | use_etag=True, |
| 339 | token=None, |
| 340 | storage_options=None, |
| 341 | download_desc=None, |
| 342 | disable_tqdm=False, |
| 343 | ) -> str: |
| 344 | """ |
| 345 | Given a URL, look for the corresponding file in the local cache. |
| 346 | If it's not there, download it. Then return the path to the cached file. |
| 347 | |
| 348 | Return: |
| 349 | Local path (string) |
| 350 | |
| 351 | Raises: |
| 352 | FileNotFoundError: in case of non-recoverable file |
| 353 | (non-existent or no cache on disk) |
| 354 | ConnectionError: in case of unreachable url |
| 355 | and no cache on disk |
| 356 | """ |
| 357 | if storage_options is None: |
| 358 | storage_options = {} |
| 359 | if cache_dir is None: |
| 360 | cache_dir = config.HF_DATASETS_CACHE |
| 361 | if isinstance(cache_dir, Path): |
| 362 | cache_dir = str(cache_dir) |
| 363 | |
| 364 | os.makedirs(cache_dir, exist_ok=True) |
| 365 | |
| 366 | response = None |
| 367 | etag = None |
| 368 | |
| 369 | # Try a first time to find the file on the local file system without eTag (None) |
| 370 | # if we don't ask for 'force_download' then we spare a request |
| 371 | filename = hash_url_to_filename(url, etag=None) |
| 372 | cache_path = os.path.join(cache_dir, filename) |
| 373 | |
| 374 | if os.path.exists(cache_path) and not force_download and not use_etag: |
| 375 | return cache_path |
| 376 | |
| 377 | # Prepare headers for authentication |
| 378 | headers = get_authentication_headers_for_url(url, token=token) |
| 379 | if user_agent is not None: |
| 380 | headers["user-agent"] = user_agent |
| 381 | |
| 382 | response = fsspec_head(url, storage_options=storage_options) |
| 383 | etag = (response.get("ETag", None) or response.get("etag", None)) if use_etag else None |
| 384 | |
| 385 | # Try a second time |
| 386 | filename = hash_url_to_filename(url, etag) |
| 387 | cache_path = os.path.join(cache_dir, filename) |
| 388 | |
| 389 | if os.path.exists(cache_path) and not force_download: |
| 390 | return cache_path |