MCPcopy
hub / github.com/huggingface/datasets / sanitize_patterns

Function sanitize_patterns

src/datasets/data_files.py:121–159  ·  view source on GitHub ↗

Take the data_files patterns from the user and format them into a dictionary. Each key is the name of a split, and each value is a list of data file patterns (paths or URLs). The default split is "train". Returns a dictionary mapping split_name -> list of patterns.

(patterns: Union[dict, list, str])

Source from the content-addressed store, hash-verified

119
120
def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], "DataFilesList"]]:
    """
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Args:
        patterns: one of
            - a dict mapping a split to a pattern or a list of patterns,
            - a single pattern string,
            - a list of pattern strings,
            - a list of ``{"split": <split_name>, "path": <path_or_list_of_paths>}`` dictionaries,
            - any other iterable, which is converted to a list and processed again.

    Returns:
        patterns: dictionary of split_name -> list of patterns

    Raises:
        ValueError: if a list contains at least one dictionary but some item is not a
            two-key ``{"split": ..., "path": ...}`` dictionary whose path is a string or a list,
            or if two entries resolve to the same split name.
    """
    if isinstance(patterns, dict):
        # Split names are always exposed as strings; a lone value is wrapped into a list.
        return {str(key): value if isinstance(value, list) else [value] for key, value in patterns.items()}
    if isinstance(patterns, str):
        return {SANITIZED_DEFAULT_SPLIT: [patterns]}
    if not isinstance(patterns, list):
        # Any other iterable (tuple, generator, ...) is materialized and handled as a list.
        return sanitize_patterns(list(patterns))
    if not any(isinstance(pattern, dict) for pattern in patterns):
        # Plain list of patterns: everything goes to the default split.
        return {SANITIZED_DEFAULT_SPLIT: patterns}

    # As soon as one item is a dictionary, every item must follow the split/path structure.
    for pattern in patterns:
        if not (
            isinstance(pattern, dict)
            and len(pattern) == 2
            and "split" in pattern
            and isinstance(pattern.get("path"), (str, list))
        ):
            raise ValueError(
                "Invalid format for data_files entry. "
                "Each item must be a dictionary with the structure "
                "{'split': <split_name>, 'path': <path_or_list_of_paths>}.\n"
                f"Received: {pattern}"
            )

    splits = [pattern["split"] for pattern in patterns]
    # Compare the string form of each split: the returned keys are str(split), so
    # e.g. 1 and "1" would otherwise pass the check and silently overwrite each other.
    split_names = [str(split) for split in splits]
    if len(set(split_names)) != len(split_names):
        raise ValueError(f"Some splits are duplicated in data_files: {splits}")
    return {
        split_name: pattern["path"] if isinstance(pattern["path"], list) else [pattern["path"]]
        for split_name, pattern in zip(split_names, patterns)
    }
160
161
162def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:

Callers 7

__init__Method · 0.85
get_moduleMethod · 0.85
get_moduleMethod · 0.85
get_moduleMethod · 0.85
get_moduleMethod · 0.85

Calls 1

itemsMethod · 0.80

Tested by

no test coverage detected