MCPcopy
hub / github.com/huggingface/datasets / get_module

Method get_module

src/datasets/load.py:429–515  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

427 self.download_mode = download_mode
428
429 def get_module(self) -> DatasetModule:
430 readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)
431 standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)
432 dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
433 if os.path.exists(standalone_yaml_path):
434 with open(standalone_yaml_path, encoding="utf-8") as f:
435 standalone_yaml_data = yaml.safe_load(f.read())
436 if standalone_yaml_data:
437 _dataset_card_data_dict = dataset_card_data.to_dict()
438 _dataset_card_data_dict.update(standalone_yaml_data)
439 dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
440 metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
441 dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
442 # we need a set of data files to find which dataset builder to use
443 # because we need to infer module name by files extensions
444 base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix()
445 if self.data_files is not None:
446 patterns = sanitize_patterns(self.data_files)
447 elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())):
448 patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
449 else:
450 patterns = get_data_patterns(base_path)
451 data_files = DataFilesDict.from_patterns(
452 patterns,
453 base_path=base_path,
454 allowed_extensions=_ALL_ALLOWED_EXTENSIONS,
455 )
456 module_name, default_builder_kwargs = infer_module_for_data_files(
457 data_files=data_files,
458 path=self.path,
459 )
460 data_files = data_files.filter(
461 extensions=_MODULE_TO_EXTENSIONS[module_name] + _MODULE_TO_METADATA_EXTENSIONS[module_name],
462 file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name],
463 )
464 module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
465 if metadata_configs:
466 builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
467 module_path,
468 metadata_configs,
469 base_path=base_path,
470 default_builder_kwargs=default_builder_kwargs,
471 )
472 else:
473 builder_configs: list[BuilderConfig] = [
474 import_main_class(module_path).BUILDER_CONFIG_CLASS(
475 data_files=data_files,
476 **default_builder_kwargs,
477 )
478 ]
479 default_config_name = None
480 builder_kwargs = {
481 "base_path": self.path,
482 "dataset_name": camelcase_to_snakecase(Path(self.path).name),
483 }
484 if self.data_dir:
485 builder_kwargs["data_files"] = data_files
486 # this file is deprecated and was created automatically in old versions of push_to_hub

Calls 15

sanitize_patterns — Function · 0.85
get_data_patterns — Function · 0.85
import_main_class — Function · 0.85
camelcase_to_snakecase — Function · 0.85
DatasetInfosDict — Class · 0.85
DatasetModule — Class · 0.85
exists — Method · 0.80
items — Method · 0.80
hash — Method · 0.80