| 582 | increase_load_count(name) |
| 583 | |
| 584 | def get_module(self) -> DatasetModule: |
| 585 | # Get the Dataset Card, pinning the revision to the resolved commit hash in case new commits land in the meantime |
| 586 | api = HfApi( |
| 587 | endpoint=config.HF_ENDPOINT, |
| 588 | token=self.download_config.token, |
| 589 | library_name="datasets", |
| 590 | library_version=__version__, |
| 591 | user_agent=get_datasets_user_agent(self.download_config.user_agent), |
| 592 | ) |
| 593 | try: |
| 594 | dataset_readme_path = api.hf_hub_download( |
| 595 | repo_id=self.name, |
| 596 | filename=config.REPOCARD_FILENAME, |
| 597 | repo_type="dataset", |
| 598 | revision=self.commit_hash, |
| 599 | proxies=self.download_config.proxies, |
| 600 | ) |
| 601 | dataset_card_data = DatasetCard.load(dataset_readme_path).data |
| 602 | except EntryNotFoundError: |
| 603 | dataset_card_data = DatasetCardData() |
| 604 | download_config = self.download_config.copy() |
| 605 | if download_config.download_desc is None: |
| 606 | download_config.download_desc = "Downloading standalone yaml" |
| 607 | try: |
| 608 | standalone_yaml_path = cached_path( |
| 609 | hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=self.commit_hash), |
| 610 | download_config=download_config, |
| 611 | ) |
| 612 | with open(standalone_yaml_path, encoding="utf-8") as f: |
| 613 | standalone_yaml_data = yaml.safe_load(f.read()) |
| 614 | if standalone_yaml_data: |
| 615 | _dataset_card_data_dict = dataset_card_data.to_dict() |
| 616 | _dataset_card_data_dict.update(standalone_yaml_data) |
| 617 | dataset_card_data = DatasetCardData(**_dataset_card_data_dict) |
| 618 | except FileNotFoundError: |
| 619 | pass |
| 620 | base_path = f"hf://datasets/{self.name}@{self.commit_hash}/{self.data_dir or ''}".rstrip("/") |
| 621 | metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data) |
| 622 | dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data) |
| 623 | if config.USE_PARQUET_EXPORT and self.use_exported_dataset_infos: |
| 624 | try: |
| 625 | exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos( |
| 626 | dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token |
| 627 | ) |
| 628 | exported_dataset_infos = DatasetInfosDict( |
| 629 | { |
| 630 | config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name]) |
| 631 | for config_name in exported_dataset_infos |
| 632 | } |
| 633 | ) |
| 634 | except _dataset_viewer.DatasetViewerError: |
| 635 | exported_dataset_infos = None |
| 636 | else: |
| 637 | exported_dataset_infos = None |
| 638 | if exported_dataset_infos: |
| 639 | exported_dataset_infos.update(dataset_infos) |
| 640 | dataset_infos = exported_dataset_infos |
| 641 | # we need a set of data files to find which dataset builder to use |