MCPcopy
hub / github.com/huggingface/datasets / get_module

Method get_module

src/datasets/load.py:865–962  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

863 self.download_mode = download_mode
864
865 def get_module(self) -> DatasetModule:
866 hffs = HfFileSystem(
867 endpoint=config.HF_ENDPOINT,
868 token=self.download_config.token,
869 )
870 readme_path = xjoin(self.path, config.REPOCARD_FILENAME)
871 standalone_yaml_path = xjoin(self.path, config.REPOYAML_FILENAME)
872 try:
873 dataset_card_data = DatasetCard(hffs.read_text(readme_path, newline="", encoding="utf-8"))
874 except FileNotFoundError:
875 dataset_card_data = DatasetCardData()
876 try:
877 standalone_yaml_data = yaml.safe_load(hffs.read_text(standalone_yaml_path, newline="", encoding="utf-8"))
878 except FileNotFoundError:
879 dataset_card_data = DatasetCardData()
880 if hffs.exists(standalone_yaml_path):
881 with hffs.open(standalone_yaml_path, "r", encoding="utf-8") as f:
882 standalone_yaml_data = yaml.safe_load(f.read())
883 if standalone_yaml_data:
884 _dataset_card_data_dict = dataset_card_data.to_dict()
885 _dataset_card_data_dict.update(standalone_yaml_data)
886 dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
887 metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
888 dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
889 # we need a set of data files to find which dataset builder to use
890 # because we need to infer module name by files extensions
891 base_path = "hf://" + Path(self.path, self.data_dir or "").as_posix()
892 if self.data_files is not None:
893 patterns = sanitize_patterns(self.data_files)
894 elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())):
895 patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
896 else:
897 patterns = get_data_patterns(base_path, download_config=self.download_config)
898 data_files = DataFilesDict.from_patterns(
899 patterns,
900 base_path=base_path,
901 allowed_extensions=_ALL_ALLOWED_EXTENSIONS,
902 )
903 module_name, default_builder_kwargs = infer_module_for_data_files(
904 data_files=data_files,
905 path=self.path,
906 )
907 data_files = data_files.filter(
908 extensions=_MODULE_TO_EXTENSIONS[module_name] + _MODULE_TO_METADATA_EXTENSIONS[module_name],
909 file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name],
910 )
911 module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
912 if metadata_configs:
913 builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
914 module_path,
915 metadata_configs,
916 base_path=base_path,
917 default_builder_kwargs=default_builder_kwargs,
918 )
919 else:
920 builder_configs: list[BuilderConfig] = [
921 import_main_class(module_path).BUILDER_CONFIG_CLASS(
922 data_files=data_files,

Callers

nothing calls this directly

Calls 15

xjoin · Function · 0.85
sanitize_patterns · Function · 0.85
get_data_patterns · Function · 0.85
import_main_class · Function · 0.85
camelcase_to_snakecase · Function · 0.85
DatasetInfosDict · Class · 0.85
DatasetModule · Class · 0.85
exists · Method · 0.80
open · Method · 0.80

Tested by

no test coverage detected