| 427 | self.download_mode = download_mode |
| 428 | |
| 429 | def get_module(self) -> DatasetModule: |
| 430 | readme_path = os.path.join(self.path, config.REPOCARD_FILENAME) |
| 431 | standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME) |
| 432 | dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData() |
| 433 | if os.path.exists(standalone_yaml_path): |
| 434 | with open(standalone_yaml_path, encoding="utf-8") as f: |
| 435 | standalone_yaml_data = yaml.safe_load(f.read()) |
| 436 | if standalone_yaml_data: |
| 437 | _dataset_card_data_dict = dataset_card_data.to_dict() |
| 438 | _dataset_card_data_dict.update(standalone_yaml_data) |
| 439 | dataset_card_data = DatasetCardData(**_dataset_card_data_dict) |
| 440 | metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data) |
| 441 | dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data) |
| 442 | # we need a set of data files to find which dataset builder to use, |
| 443 | # because the module name is inferred from the file extensions |
| 444 | base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix() |
| 445 | if self.data_files is not None: |
| 446 | patterns = sanitize_patterns(self.data_files) |
| 447 | elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())): |
| 448 | patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"]) |
| 449 | else: |
| 450 | patterns = get_data_patterns(base_path) |
| 451 | data_files = DataFilesDict.from_patterns( |
| 452 | patterns, |
| 453 | base_path=base_path, |
| 454 | allowed_extensions=_ALL_ALLOWED_EXTENSIONS, |
| 455 | ) |
| 456 | module_name, default_builder_kwargs = infer_module_for_data_files( |
| 457 | data_files=data_files, |
| 458 | path=self.path, |
| 459 | ) |
| 460 | data_files = data_files.filter( |
| 461 | extensions=_MODULE_TO_EXTENSIONS[module_name] + _MODULE_TO_METADATA_EXTENSIONS[module_name], |
| 462 | file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name], |
| 463 | ) |
| 464 | module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] |
| 465 | if metadata_configs: |
| 466 | builder_configs, default_config_name = create_builder_configs_from_metadata_configs( |
| 467 | module_path, |
| 468 | metadata_configs, |
| 469 | base_path=base_path, |
| 470 | default_builder_kwargs=default_builder_kwargs, |
| 471 | ) |
| 472 | else: |
| 473 | builder_configs: list[BuilderConfig] = [ |
| 474 | import_main_class(module_path).BUILDER_CONFIG_CLASS( |
| 475 | data_files=data_files, |
| 476 | **default_builder_kwargs, |
| 477 | ) |
| 478 | ] |
| 479 | default_config_name = None |
| 480 | builder_kwargs = { |
| 481 | "base_path": self.path, |
| 482 | "dataset_name": camelcase_to_snakecase(Path(self.path).name), |
| 483 | } |
| 484 | if self.data_dir: |
| 485 | builder_kwargs["data_files"] = data_files |
| 486 | # this file is deprecated and was created automatically in old versions of push_to_hub |