Take the data_files patterns from the user and format them into a dictionary. Each key is the name of a split, and each value is a list of data file patterns (paths or URLs). The default split is "train". Returns: patterns: dictionary of split_name -> list of patterns.
(patterns: Union[dict, list, str])
| 119 | |
| 120 | |
def sanitize_patterns(patterns: Union[dict, list, str]) -> dict[str, Union[list[str], "DataFilesList"]]:
    """
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Accepted inputs:
        - a dict of split name -> pattern or list of patterns: keys are converted with ``str`` and
          non-list values are wrapped in a one-element list
        - a single string pattern: assigned to the default split
        - a list without any dictionary in it: assigned as-is to the default split
        - a list of ``{"split": <split_name>, "path": <path_or_list_of_paths>}`` dictionaries
        - any other iterable: converted to a list and processed as described above

    Args:
        patterns: the data_files value provided by the user.

    Returns:
        patterns: dictionary of split_name -> list of patterns

    Raises:
        ValueError: if a list contains at least one dictionary but some item is not of the form
            ``{"split": ..., "path": ...}``, or if several items resolve to the same split name.
    """
    if isinstance(patterns, dict):
        return {str(key): value if isinstance(value, list) else [value] for key, value in patterns.items()}
    if isinstance(patterns, str):
        return {SANITIZED_DEFAULT_SPLIT: [patterns]}
    if not isinstance(patterns, list):
        # Any other iterable (tuple, set, generator, ...) is materialized and handled as a list.
        return sanitize_patterns(list(patterns))
    if not any(isinstance(pattern, dict) for pattern in patterns):
        return {SANITIZED_DEFAULT_SPLIT: patterns}

    # As soon as one item is a dictionary, every item must be a {"split": ..., "path": ...} dictionary.
    for pattern in patterns:
        if not (
            isinstance(pattern, dict)
            and len(pattern) == 2
            and "split" in pattern
            and isinstance(pattern.get("path"), (str, list))
        ):
            raise ValueError(
                "Invalid format for data_files entry. "
                "Each item must be a dictionary with the structure "
                "{'split': <split_name>, 'path': <path_or_list_of_paths>}.\n"
                f"Received: {pattern}"
            )

    splits = [pattern["split"] for pattern in patterns]
    # The returned keys are str(split), so duplicates must also be detected on the stringified
    # names: otherwise e.g. 1 and "1" would pass the check and silently overwrite each other below.
    split_names = [str(split) for split in splits]
    if len(set(splits)) != len(splits) or len(set(split_names)) != len(split_names):
        raise ValueError(f"Some splits are duplicated in data_files: {splits}")

    return {
        split_name: pattern["path"] if isinstance(pattern["path"], list) else [pattern["path"]]
        for split_name, pattern in zip(split_names, patterns)
    }
| 160 | |
| 161 | |
| 162 | def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool: |
no test coverage detected