(self)
| 863 | self.download_mode = download_mode |
| 864 | |
| 865 | def get_module(self) -> DatasetModule: |
| 866 | hffs = HfFileSystem( |
| 867 | endpoint=config.HF_ENDPOINT, |
| 868 | token=self.download_config.token, |
| 869 | ) |
| 870 | readme_path = xjoin(self.path, config.REPOCARD_FILENAME) |
| 871 | standalone_yaml_path = xjoin(self.path, config.REPOYAML_FILENAME) |
| 872 | try: |
| 873 | dataset_card_data = DatasetCard(hffs.read_text(readme_path, newline="", encoding="utf-8")) |
| 874 | except FileNotFoundError: |
| 875 | dataset_card_data = DatasetCardData() |
| 876 | try: |
| 877 | standalone_yaml_data = yaml.safe_load(hffs.read_text(standalone_yaml_path, newline="", encoding="utf-8")) |
| 878 | except FileNotFoundError: |
| 879 | dataset_card_data = DatasetCardData() |
| 880 | if hffs.exists(standalone_yaml_path): |
| 881 | with hffs.open(standalone_yaml_path, "r", encoding="utf-8") as f: |
| 882 | standalone_yaml_data = yaml.safe_load(f.read()) |
| 883 | if standalone_yaml_data: |
| 884 | _dataset_card_data_dict = dataset_card_data.to_dict() |
| 885 | _dataset_card_data_dict.update(standalone_yaml_data) |
| 886 | dataset_card_data = DatasetCardData(**_dataset_card_data_dict) |
| 887 | metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data) |
| 888 | dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data) |
| 889 | # we need a set of data files to determine which dataset builder to use, |
| 890 | # because the module name is inferred from the data files' extensions |
| 891 | base_path = "hf://" + Path(self.path, self.data_dir or "").as_posix() |
| 892 | if self.data_files is not None: |
| 893 | patterns = sanitize_patterns(self.data_files) |
| 894 | elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())): |
| 895 | patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"]) |
| 896 | else: |
| 897 | patterns = get_data_patterns(base_path, download_config=self.download_config) |
| 898 | data_files = DataFilesDict.from_patterns( |
| 899 | patterns, |
| 900 | base_path=base_path, |
| 901 | allowed_extensions=_ALL_ALLOWED_EXTENSIONS, |
| 902 | ) |
| 903 | module_name, default_builder_kwargs = infer_module_for_data_files( |
| 904 | data_files=data_files, |
| 905 | path=self.path, |
| 906 | ) |
| 907 | data_files = data_files.filter( |
| 908 | extensions=_MODULE_TO_EXTENSIONS[module_name] + _MODULE_TO_METADATA_EXTENSIONS[module_name], |
| 909 | file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name], |
| 910 | ) |
| 911 | module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] |
| 912 | if metadata_configs: |
| 913 | builder_configs, default_config_name = create_builder_configs_from_metadata_configs( |
| 914 | module_path, |
| 915 | metadata_configs, |
| 916 | base_path=base_path, |
| 917 | default_builder_kwargs=default_builder_kwargs, |
| 918 | ) |
| 919 | else: |
| 920 | builder_configs: list[BuilderConfig] = [ |
| 921 | import_main_class(module_path).BUILDER_CONFIG_CLASS( |
| 922 | data_files=data_files, |
nothing calls this directly
no test coverage detected