Get the module of a dataset loaded from the data files of a dataset repository. The dataset builder module to use is inferred from the file extensions of the data files.
| 557 | |
| 558 | |
| 559 | class HubDatasetModuleFactory(_DatasetModuleFactory): |
| 560 | """ |
| 561 | Get the module of a dataset loaded from data files of a dataset repository. |
| 562 | The dataset builder module to use is inferred from the data files extensions. |
| 563 | """ |
| 564 | |
def __init__(
    self,
    name: str,
    commit_hash: str,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, list, dict]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    use_exported_dataset_infos: bool = False,
):
    """Record the repository coordinates and loading options on the factory.

    Every argument is stored unchanged on the instance under the same name,
    except ``download_config``: a falsy value is replaced by a freshly
    constructed ``DownloadConfig()``.

    After the attributes are set, ``increase_load_count`` is called with
    ``name``.
    """
    # Repository coordinates.
    self.name, self.commit_hash = name, commit_hash

    # Data selection options, kept exactly as given.
    self.data_files, self.data_dir = data_files, data_dir

    # Fall back to a default configuration when none (or a falsy one) is given.
    self.download_config = download_config if download_config else DownloadConfig()

    self.download_mode = download_mode
    self.use_exported_dataset_infos = use_exported_dataset_infos

    increase_load_count(name)
| 583 | |
| 584 | def get_module(self) -> DatasetModule: |
| 585 | # Get the Dataset Card and fix the revision in case there are new commits in the meantime |
| 586 | api = HfApi( |
| 587 | endpoint=config.HF_ENDPOINT, |
| 588 | token=self.download_config.token, |
| 589 | library_name="datasets", |
| 590 | library_version=__version__, |
| 591 | user_agent=get_datasets_user_agent(self.download_config.user_agent), |
| 592 | ) |
| 593 | try: |
| 594 | dataset_readme_path = api.hf_hub_download( |
| 595 | repo_id=self.name, |
| 596 | filename=config.REPOCARD_FILENAME, |
| 597 | repo_type="dataset", |
| 598 | revision=self.commit_hash, |
| 599 | proxies=self.download_config.proxies, |
| 600 | ) |
| 601 | dataset_card_data = DatasetCard.load(dataset_readme_path).data |
| 602 | except EntryNotFoundError: |
| 603 | dataset_card_data = DatasetCardData() |
| 604 | download_config = self.download_config.copy() |
| 605 | if download_config.download_desc is None: |
| 606 | download_config.download_desc = "Downloading standalone yaml" |
| 607 | try: |
| 608 | standalone_yaml_path = cached_path( |
| 609 | hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=self.commit_hash), |
| 610 | download_config=download_config, |
| 611 | ) |
| 612 | with open(standalone_yaml_path, encoding="utf-8") as f: |
| 613 | standalone_yaml_data = yaml.safe_load(f.read()) |
| 614 | if standalone_yaml_data: |
| 615 | _dataset_card_data_dict = dataset_card_data.to_dict() |
| 616 | _dataset_card_data_dict.update(standalone_yaml_data) |
no outgoing calls