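The config id is used to build the cache directory. By default it is equal to the config name. However the name of a config is not sufficient to have a unique identifier for the dataset being generated since it doesn't take into account: - the config kwargs that can be used to overwrite attributes - the custom features used to write the dataset - the data_files for json/text/csv/pandas datasets. Therefore the config id is just the config name with an optional suffix based on these.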
(
self,
config_kwargs: dict,
custom_features: Optional[Features] = None,
)
| 142 | return all((k, getattr(self, k)) == (k, getattr(o, k)) for k in self.__dict__.keys()) |
| 143 | |
| 144 | def create_config_id( |
| 145 | self, |
| 146 | config_kwargs: dict, |
| 147 | custom_features: Optional[Features] = None, |
| 148 | ) -> str: |
| 149 | """ |
| 150 | The config id is used to build the cache directory. |
| 151 | By default it is equal to the config name. |
| 152 | However the name of a config is not sufficient to have a unique identifier for the dataset being generated |
| 153 | since it doesn't take into account: |
| 154 | - the config kwargs that can be used to overwrite attributes |
| 155 | - the custom features used to write the dataset |
| 156 | - the data_files for json/text/csv/pandas datasets |
| 157 | |
| 158 | Therefore the config id is just the config name with an optional suffix based on these. |
| 159 | """ |
| 160 | # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs |
| 161 | suffix: Optional[str] = None |
| 162 | config_kwargs_to_add_to_suffix = config_kwargs.copy() |
| 163 | # name and version are already used to build the cache directory |
| 164 | config_kwargs_to_add_to_suffix.pop("name", None) |
| 165 | config_kwargs_to_add_to_suffix.pop("version", None) |
| 166 | # data dir handling (when specified it points to the manually downloaded data): |
| 167 | # it was ignored before the introduction of config id because we didn't want |
| 168 | # to change the config name. Now it's fine to take it into account for the config id. |
| 169 | # config_kwargs_to_add_to_suffix.pop("data_dir", None) |
| 170 | if "data_dir" in config_kwargs_to_add_to_suffix: |
| 171 | if config_kwargs_to_add_to_suffix["data_dir"] is None: |
| 172 | config_kwargs_to_add_to_suffix.pop("data_dir", None) |
| 173 | else: |
| 174 | # canonicalize the data dir to avoid two paths to the same location having different |
| 175 | # hashes |
| 176 | data_dir = config_kwargs_to_add_to_suffix["data_dir"] |
| 177 | data_dir = os.path.normpath(data_dir) |
| 178 | config_kwargs_to_add_to_suffix["data_dir"] = data_dir |
| 179 | if config_kwargs_to_add_to_suffix: |
| 180 | # we don't care about the order of the kwargs |
| 181 | config_kwargs_to_add_to_suffix = { |
| 182 | k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix) |
| 183 | } |
| 184 | if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()): |
| 185 | suffix = ",".join( |
| 186 | str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items() |
| 187 | ) |
| 188 | if len(suffix) > 32: # hash if too long |
| 189 | suffix = Hasher.hash(config_kwargs_to_add_to_suffix) |
| 190 | else: |
| 191 | suffix = Hasher.hash(config_kwargs_to_add_to_suffix) |
| 192 | |
| 193 | if custom_features is not None: |
| 194 | m = Hasher() |
| 195 | if suffix: |
| 196 | m.update(suffix) |
| 197 | m.update(custom_features) |
| 198 | suffix = m.hexdigest() |
| 199 | |
| 200 | if suffix: |
| 201 | config_id = self.name + "-" + suffix |