MCPcopy Index your code
hub / github.com/huggingface/datasets / LocalDatasetModuleFactory

Class LocalDatasetModuleFactory

src/datasets/load.py:409–515  ·  view source on GitHub ↗

Get the module of a dataset loaded from the user's data files. The dataset builder module to use is inferred from the data files' extensions.

Source from the content-addressed store, hash-verified

407
408
409class LocalDatasetModuleFactory(_DatasetModuleFactory):
410 """Get the module of a dataset loaded from the user's data files. The dataset builder module to use is inferred
411 from the data files extensions."""
412
def __init__(
    self,
    path: str,
    data_dir: Optional[str] = None,
    data_files: Optional[Union[str, list, dict]] = None,
    download_mode: Optional[Union[DownloadMode, str]] = None,
):
    """Record where the local dataset lives and how its files were requested.

    Args:
        path: Location of the dataset on disk. It is stored in POSIX form
            and its stem becomes the factory's ``name``.
        data_dir: Optional sub-directory; it must not be an absolute path.
        data_files: Optional data file specification, stored unchanged.
        download_mode: Optional download mode, stored unchanged.

    Raises:
        ValueError: If ``data_dir`` is given as an absolute path.
    """
    # Reject absolute directories before touching any attribute.
    if data_dir and os.path.isabs(data_dir):
        raise ValueError(f"`data_dir` must be relative to a dataset directory's root: {path}")

    # Parse the path once; both the POSIX form and the stem derive from it.
    dataset_path = Path(path)
    self.path = dataset_path.as_posix()
    self.name = dataset_path.stem

    # The remaining arguments are kept verbatim for later use.
    self.data_dir = data_dir
    self.data_files = data_files
    self.download_mode = download_mode
428
429 def get_module(self) -> DatasetModule:
430 readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)
431 standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)
432 dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
433 if os.path.exists(standalone_yaml_path):
434 with open(standalone_yaml_path, encoding="utf-8") as f:
435 standalone_yaml_data = yaml.safe_load(f.read())
436 if standalone_yaml_data:
437 _dataset_card_data_dict = dataset_card_data.to_dict()
438 _dataset_card_data_dict.update(standalone_yaml_data)
439 dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
440 metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
441 dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
442 # we need a set of data files to find which dataset builder to use
443 # because we need to infer module name by files extensions
444 base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix()
445 if self.data_files is not None:
446 patterns = sanitize_patterns(self.data_files)
447 elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())):
448 patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
449 else:
450 patterns = get_data_patterns(base_path)
451 data_files = DataFilesDict.from_patterns(
452 patterns,
453 base_path=base_path,
454 allowed_extensions=_ALL_ALLOWED_EXTENSIONS,
455 )
456 module_name, default_builder_kwargs = infer_module_for_data_files(
457 data_files=data_files,
458 path=self.path,
459 )
460 data_files = data_files.filter(
461 extensions=_MODULE_TO_EXTENSIONS[module_name] + _MODULE_TO_METADATA_EXTENSIONS[module_name],
462 file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name],
463 )
464 module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
465 if metadata_configs:
466 builder_configs, default_config_name = create_builder_configs_from_metadata_configs(

Calls

no outgoing calls