MCPcopy
hub / github.com/huggingface/datasets / get_module

Method get_module

src/datasets/load.py:584–725  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

582 increase_load_count(name)
583
584 def get_module(self) -> DatasetModule:
585 # Get the Dataset Card and fix the revision in case there are new commits in the meantime
586 api = HfApi(
587 endpoint=config.HF_ENDPOINT,
588 token=self.download_config.token,
589 library_name="datasets",
590 library_version=__version__,
591 user_agent=get_datasets_user_agent(self.download_config.user_agent),
592 )
593 try:
594 dataset_readme_path = api.hf_hub_download(
595 repo_id=self.name,
596 filename=config.REPOCARD_FILENAME,
597 repo_type="dataset",
598 revision=self.commit_hash,
599 proxies=self.download_config.proxies,
600 )
601 dataset_card_data = DatasetCard.load(dataset_readme_path).data
602 except EntryNotFoundError:
603 dataset_card_data = DatasetCardData()
604 download_config = self.download_config.copy()
605 if download_config.download_desc is None:
606 download_config.download_desc = "Downloading standalone yaml"
607 try:
608 standalone_yaml_path = cached_path(
609 hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=self.commit_hash),
610 download_config=download_config,
611 )
612 with open(standalone_yaml_path, encoding="utf-8") as f:
613 standalone_yaml_data = yaml.safe_load(f.read())
614 if standalone_yaml_data:
615 _dataset_card_data_dict = dataset_card_data.to_dict()
616 _dataset_card_data_dict.update(standalone_yaml_data)
617 dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
618 except FileNotFoundError:
619 pass
620 base_path = f"hf://datasets/{self.name}@{self.commit_hash}/{self.data_dir or ''}".rstrip("/")
621 metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
622 dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
623 if config.USE_PARQUET_EXPORT and self.use_exported_dataset_infos:
624 try:
625 exported_dataset_infos = _dataset_viewer.get_exported_dataset_infos(
626 dataset=self.name, commit_hash=self.commit_hash, token=self.download_config.token
627 )
628 exported_dataset_infos = DatasetInfosDict(
629 {
630 config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name])
631 for config_name in exported_dataset_infos
632 }
633 )
634 except _dataset_viewer.DatasetViewerError:
635 exported_dataset_infos = None
636 else:
637 exported_dataset_infos = None
638 if exported_dataset_infos:
639 exported_dataset_infos.update(dataset_infos)
640 dataset_infos = exported_dataset_infos
641 # we need a set of data files to find which dataset builder to use

Calls 15

get_datasets_user_agent · Function · 0.85
cached_path · Function · 0.85
DatasetInfosDict · Class · 0.85
sanitize_patterns · Function · 0.85
get_data_patterns · Function · 0.85
import_main_class · Function · 0.85
camelcase_to_snakecase · Function · 0.85
DatasetModule · Class · 0.85
items · Method · 0.80