(
self,
cache_dir: Optional[str] = None,
dataset_name: Optional[str] = None,
config_name: Optional[str] = None,
hash: Optional[str] = None,
base_path: Optional[str] = None,
info: Optional[DatasetInfo] = None,
features: Optional[Features] = None,
token: Optional[Union[bool, str]] = None,
repo_id: Optional[str] = None,
data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
data_dir: Optional[str] = None,
storage_options: Optional[dict] = None,
writer_batch_size: Optional[int] = None,
config_id: Optional[str] = None,
**config_kwargs,
)
| 306 | SLEEP_ON_THREADS_SHUTDOWNS = False |
| 307 | |
| 308 | def __init__( |
| 309 | self, |
| 310 | cache_dir: Optional[str] = None, |
| 311 | dataset_name: Optional[str] = None, |
| 312 | config_name: Optional[str] = None, |
| 313 | hash: Optional[str] = None, |
| 314 | base_path: Optional[str] = None, |
| 315 | info: Optional[DatasetInfo] = None, |
| 316 | features: Optional[Features] = None, |
| 317 | token: Optional[Union[bool, str]] = None, |
| 318 | repo_id: Optional[str] = None, |
| 319 | data_files: Optional[Union[str, list, dict, DataFilesDict]] = None, |
| 320 | data_dir: Optional[str] = None, |
| 321 | storage_options: Optional[dict] = None, |
| 322 | writer_batch_size: Optional[int] = None, |
| 323 | config_id: Optional[str] = None, |
| 324 | **config_kwargs, |
| 325 | ): |
| 326 | # DatasetBuilder name |
| 327 | self.name: str = camelcase_to_snakecase(self.__module__.split(".")[-1]) |
| 328 | self.hash: Optional[str] = hash |
| 329 | self.base_path = base_path |
| 330 | self.token = token |
| 331 | self.repo_id = repo_id |
| 332 | self.storage_options = storage_options or {} |
| 333 | self.dataset_name = camelcase_to_snakecase(dataset_name) if dataset_name else self.name |
| 334 | self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE |
| 335 | |
| 336 | if data_files is not None and not isinstance(data_files, DataFilesDict): |
| 337 | data_files = DataFilesDict.from_patterns( |
| 338 | sanitize_patterns(data_files), |
| 339 | base_path=base_path, |
| 340 | download_config=DownloadConfig(token=token, storage_options=self.storage_options), |
| 341 | ) |
| 342 | |
| 343 | # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset |
| 344 | if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None: |
| 345 | config_kwargs["features"] = features |
| 346 | if data_files is not None: |
| 347 | config_kwargs["data_files"] = data_files |
| 348 | if data_dir is not None: |
| 349 | config_kwargs["data_dir"] = data_dir |
| 350 | self.config_kwargs = config_kwargs |
| 351 | self.config, self.config_id = self._create_builder_config( |
| 352 | config_name=config_name, |
| 353 | custom_features=features, |
| 354 | config_id=config_id, |
| 355 | **config_kwargs, |
| 356 | ) |
| 357 | |
| 358 | # prepare info: DatasetInfo are a standardized dataclass across all datasets |
| 359 | # Prefill datasetinfo |
| 360 | if info is None: |
| 361 | info = self._info() |
| 362 | info.builder_name = self.name |
| 363 | info.dataset_name = self.dataset_name |
| 364 | info.config_name = self.config.name |
| 365 | info.version = self.config.version |
Nothing calls this `__init__` method directly.
No test coverage was detected.