MCPcopy
hub / github.com/huggingface/datasets / __init__

Method __init__

src/datasets/builder.py:308–423  ·  view source on GitHub ↗
(
        self,
        cache_dir: Optional[str] = None,
        dataset_name: Optional[str] = None,
        config_name: Optional[str] = None,
        hash: Optional[str] = None,
        base_path: Optional[str] = None,
        info: Optional[DatasetInfo] = None,
        features: Optional[Features] = None,
        token: Optional[Union[bool, str]] = None,
        repo_id: Optional[str] = None,
        data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
        data_dir: Optional[str] = None,
        storage_options: Optional[dict] = None,
        writer_batch_size: Optional[int] = None,
        config_id: Optional[str] = None,
        **config_kwargs,
    )

Source from the content-addressed store, hash-verified

306 SLEEP_ON_THREADS_SHUTDOWNS = False
307
308 def __init__(
309 self,
310 cache_dir: Optional[str] = None,
311 dataset_name: Optional[str] = None,
312 config_name: Optional[str] = None,
313 hash: Optional[str] = None,
314 base_path: Optional[str] = None,
315 info: Optional[DatasetInfo] = None,
316 features: Optional[Features] = None,
317 token: Optional[Union[bool, str]] = None,
318 repo_id: Optional[str] = None,
319 data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
320 data_dir: Optional[str] = None,
321 storage_options: Optional[dict] = None,
322 writer_batch_size: Optional[int] = None,
323 config_id: Optional[str] = None,
324 **config_kwargs,
325 ):
326 # DatasetBuilder name
327 self.name: str = camelcase_to_snakecase(self.__module__.split(".")[-1])
328 self.hash: Optional[str] = hash
329 self.base_path = base_path
330 self.token = token
331 self.repo_id = repo_id
332 self.storage_options = storage_options or {}
333 self.dataset_name = camelcase_to_snakecase(dataset_name) if dataset_name else self.name
334 self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE
335
336 if data_files is not None and not isinstance(data_files, DataFilesDict):
337 data_files = DataFilesDict.from_patterns(
338 sanitize_patterns(data_files),
339 base_path=base_path,
340 download_config=DownloadConfig(token=token, storage_options=self.storage_options),
341 )
342
343 # Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
344 if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:
345 config_kwargs["features"] = features
346 if data_files is not None:
347 config_kwargs["data_files"] = data_files
348 if data_dir is not None:
349 config_kwargs["data_dir"] = data_dir
350 self.config_kwargs = config_kwargs
351 self.config, self.config_id = self._create_builder_config(
352 config_name=config_name,
353 custom_features=features,
354 config_id=config_id,
355 **config_kwargs,
356 )
357
358 # prepare info: DatasetInfo are a standardized dataclass across all datasets
359 # Prefill datasetinfo
360 if info is None:
361 info = self._info()
362 info.builder_name = self.name
363 info.dataset_name = self.dataset_name
364 info.config_name = self.config.name
365 info.version = self.config.version

Callers

nothing calls this directly

Calls 15

_info · Method · 0.95
_build_cache_dir · Method · 0.95
camelcase_to_snakecase · Function · 0.85
sanitize_patterns · Function · 0.85
DownloadConfig · Class · 0.85
is_remote_url · Function · 0.85
FileLock · Class · 0.85
split · Method · 0.80
makedirs · Method · 0.80
exists · Method · 0.80

Tested by

no test coverage detected