MCPcopy
hub / github.com/huggingface/datasets / HubDatasetModuleFactory

Class HubDatasetModuleFactory

src/datasets/load.py:559–725  ·  view source on GitHub ↗

Get the module of a dataset loaded from the data files of a dataset repository. The dataset builder module to use is inferred from the extensions of the data files.

Source from the content-addressed store, hash-verified

557
558
559class HubDatasetModuleFactory(_DatasetModuleFactory):
560 """
561 Get the module of a dataset loaded from data files of a dataset repository.
562 The dataset builder module to use is inferred from the data files extensions.
563 """
564
def __init__(
    self,
    name: str,
    commit_hash: str,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, list, dict]] = None,
    download_config: Optional[DownloadConfig] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
    use_exported_dataset_infos: bool = False,
):
    """
    Record the dataset repository coordinates and the loading options.

    Every argument is stored on the instance under the same name. When
    `download_config` is not provided (or is falsy), a default
    `DownloadConfig()` is stored instead. `increase_load_count` is
    called with `name` once all attributes are set.
    """
    # Resolve the default first so the attribute is never None.
    if not download_config:
        download_config = DownloadConfig()

    # Repository identity.
    self.name, self.commit_hash = name, commit_hash

    # Data selection within the repository.
    self.data_dir, self.data_files = data_dir, data_files

    # Download / export behaviour.
    self.download_config = download_config
    self.download_mode = download_mode
    self.use_exported_dataset_infos = use_exported_dataset_infos

    increase_load_count(name)
583
584 def get_module(self) -> DatasetModule:
585 # Get the Dataset Card and fix the revision in case there are new commits in the meantime
586 api = HfApi(
587 endpoint=config.HF_ENDPOINT,
588 token=self.download_config.token,
589 library_name="datasets",
590 library_version=__version__,
591 user_agent=get_datasets_user_agent(self.download_config.user_agent),
592 )
593 try:
594 dataset_readme_path = api.hf_hub_download(
595 repo_id=self.name,
596 filename=config.REPOCARD_FILENAME,
597 repo_type="dataset",
598 revision=self.commit_hash,
599 proxies=self.download_config.proxies,
600 )
601 dataset_card_data = DatasetCard.load(dataset_readme_path).data
602 except EntryNotFoundError:
603 dataset_card_data = DatasetCardData()
604 download_config = self.download_config.copy()
605 if download_config.download_desc is None:
606 download_config.download_desc = "Downloading standalone yaml"
607 try:
608 standalone_yaml_path = cached_path(
609 hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=self.commit_hash),
610 download_config=download_config,
611 )
612 with open(standalone_yaml_path, encoding="utf-8") as f:
613 standalone_yaml_data = yaml.safe_load(f.read())
614 if standalone_yaml_data:
615 _dataset_card_data_dict = dataset_card_data.to_dict()
616 _dataset_card_data_dict.update(standalone_yaml_data)

Calls

no outgoing calls