Get the module of a dataset loaded from the user's data files. The dataset builder module to use is inferred from the data files' extensions.
| 407 | |
| 408 | |
| 409 | class LocalDatasetModuleFactory(_DatasetModuleFactory): |
| 410 | """Get the module of a dataset loaded from the user's data files. The dataset builder module to use is inferred |
| 411 | from the data files extensions.""" |
| 412 | |
def __init__(
    self,
    path: str,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, list, dict]] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
):
    """Store the location of a local dataset and the options used to select its data files.

    Args:
        path: Path of the dataset. It is kept in POSIX form as `self.path`, and its stem
            (final component without the last suffix) is kept as `self.name`.
        data_dir: Optional directory inside the dataset; an absolute path is rejected.
        data_files: Optional data files specification, stored unchanged.
        download_mode: Optional download mode, stored unchanged.

    Raises:
        ValueError: If `data_dir` is non-empty and is an absolute path.
    """
    # An empty or missing `data_dir` is always accepted; only absolute paths are refused.
    if data_dir:
        if os.path.isabs(data_dir):
            raise ValueError(f"`data_dir` must be relative to a dataset directory's root: {path}")

    dataset_path = Path(path)
    self.path = dataset_path.as_posix()
    self.name = dataset_path.stem
    self.data_files, self.data_dir, self.download_mode = data_files, data_dir, download_mode
| 428 | |
| 429 | def get_module(self) -> DatasetModule: |
| 430 | readme_path = os.path.join(self.path, config.REPOCARD_FILENAME) |
| 431 | standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME) |
| 432 | dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData() |
| 433 | if os.path.exists(standalone_yaml_path): |
| 434 | with open(standalone_yaml_path, encoding="utf-8") as f: |
| 435 | standalone_yaml_data = yaml.safe_load(f.read()) |
| 436 | if standalone_yaml_data: |
| 437 | _dataset_card_data_dict = dataset_card_data.to_dict() |
| 438 | _dataset_card_data_dict.update(standalone_yaml_data) |
| 439 | dataset_card_data = DatasetCardData(**_dataset_card_data_dict) |
| 440 | metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data) |
| 441 | dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data) |
| 442 | # we need a set of data files to find which dataset builder to use |
| 443 | # because we need to infer module name by files extensions |
| 444 | base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix() |
| 445 | if self.data_files is not None: |
| 446 | patterns = sanitize_patterns(self.data_files) |
| 447 | elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())): |
| 448 | patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"]) |
| 449 | else: |
| 450 | patterns = get_data_patterns(base_path) |
| 451 | data_files = DataFilesDict.from_patterns( |
| 452 | patterns, |
| 453 | base_path=base_path, |
| 454 | allowed_extensions=_ALL_ALLOWED_EXTENSIONS, |
| 455 | ) |
| 456 | module_name, default_builder_kwargs = infer_module_for_data_files( |
| 457 | data_files=data_files, |
| 458 | path=self.path, |
| 459 | ) |
| 460 | data_files = data_files.filter( |
| 461 | extensions=_MODULE_TO_EXTENSIONS[module_name] + _MODULE_TO_METADATA_EXTENSIONS[module_name], |
| 462 | file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name], |
| 463 | ) |
| 464 | module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] |
| 465 | if metadata_configs: |
| 466 | builder_configs, default_config_name = create_builder_configs_from_metadata_configs( |
no outgoing calls