Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and then return the path. Return: Local path (string).
(
url_or_filename,
download_config=None,
**download_kwargs,
)
| 132 | |
| 133 | |
| 134 | def cached_path( |
| 135 | url_or_filename, |
| 136 | download_config=None, |
| 137 | **download_kwargs, |
| 138 | ) -> str: |
| 139 | """ |
| 140 | Given something that might be a URL (or might be a local path), |
| 141 | determine which. If it's a URL, download the file and cache it, and |
| 142 | return the path to the cached file. If it's already a local path, |
| 143 | make sure the file exists and then return the path. |
| 144 | |
| 145 | Return: |
| 146 | Local path (string) |
| 147 | |
| 148 | Raises: |
| 149 | FileNotFoundError: in case of non-recoverable file |
| 150 | (non-existent or no cache on disk) |
| 151 | ConnectionError: in case of unreachable url |
| 152 | and no cache on disk |
| 153 | ValueError: if it couldn't parse the url or filename correctly |
| 154 | httpx.NetworkError or requests.exceptions.ConnectionError: in case of internet connection issue |
| 155 | """ |
| 156 | if download_config is None: |
| 157 | download_config = DownloadConfig(**download_kwargs) |
| 158 | |
| 159 | cache_dir = download_config.cache_dir or config.DOWNLOADED_DATASETS_PATH |
| 160 | if isinstance(cache_dir, Path): |
| 161 | cache_dir = str(cache_dir) |
| 162 | if isinstance(url_or_filename, Path): |
| 163 | url_or_filename = str(url_or_filename) |
| 164 | |
| 165 | # Convert fsspec URL in the format "file://local/path" to "local/path" |
| 166 | if can_be_local(url_or_filename): |
| 167 | url_or_filename = strip_protocol(url_or_filename) |
| 168 | |
| 169 | if is_remote_url(url_or_filename): |
| 170 | # URL, so get it from the cache (downloading if necessary) |
| 171 | url_or_filename, storage_options = _prepare_path_and_storage_options( |
| 172 | url_or_filename, download_config=download_config |
| 173 | ) |
| 174 | # Download files from Hugging Face. |
| 175 | # Note: no need to check for https://huggingface.co file URLs since _prepare_path_and_storage_options |
| 176 | # prepares Hugging Face HTTP URLs as hf:// paths already |
| 177 | if url_or_filename.startswith("hf://") and not url_or_filename.startswith("hf://buckets/"): |
| 178 | resolved_path = huggingface_hub.HfFileSystem( |
| 179 | endpoint=config.HF_ENDPOINT, token=download_config.token |
| 180 | ).resolve_path(url_or_filename) |
| 181 | try: |
| 182 | output_path = huggingface_hub.HfApi( |
| 183 | endpoint=config.HF_ENDPOINT, |
| 184 | token=download_config.token, |
| 185 | library_name="datasets", |
| 186 | library_version=__version__, |
| 187 | user_agent=get_datasets_user_agent(download_config.user_agent), |
| 188 | ).hf_hub_download( |
| 189 | repo_id=resolved_path.repo_id, |
| 190 | repo_type=resolved_path.repo_type, |
| 191 | revision=resolved_path.revision, |