MCPcopy
hub / github.com/huggingface/datasets / create_config_id

Method create_config_id

src/datasets/builder.py:144–206  ·  view source on GitHub ↗

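The config id is used to build the cache directory. By default it is equal to the config name. However, the name of a config is not sufficient to uniquely identify the dataset being generated, since it doesn't take into account: - the config kwargs that can be used to overwrite attributes - the custom features used to write the dataset - the data_files for json/text/csv/pandas datasets. Therefore the config id is just the config name with an optional suffix based on these.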

(
        self,
        config_kwargs: dict,
        custom_features: Optional[Features] = None,
    )

Source from the content-addressed store, hash-verified

142 return all((k, getattr(self, k)) == (k, getattr(o, k)) for k in self.__dict__.keys())
143
144 def create_config_id(
145 self,
146 config_kwargs: dict,
147 custom_features: Optional[Features] = None,
148 ) -> str:
149 """
150 The config id is used to build the cache directory.
151 By default it is equal to the config name.
152 However the name of a config is not sufficient to have a unique identifier for the dataset being generated
153 since it doesn't take into account:
154 - the config kwargs that can be used to overwrite attributes
155 - the custom features used to write the dataset
156 - the data_files for json/text/csv/pandas datasets
157
158 Therefore the config id is just the config name with an optional suffix based on these.
159 """
160 # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
161 suffix: Optional[str] = None
162 config_kwargs_to_add_to_suffix = config_kwargs.copy()
163 # name and version are already used to build the cache directory
164 config_kwargs_to_add_to_suffix.pop("name", None)
165 config_kwargs_to_add_to_suffix.pop("version", None)
166 # data dir handling (when specified it points to the manually downloaded data):
167 # it was previously ignored before the introduction of config id because we didn't want
168 # to change the config name. Now it's fine to take it into account for the config id.
169 # config_kwargs_to_add_to_suffix.pop("data_dir", None)
170 if "data_dir" in config_kwargs_to_add_to_suffix:
171 if config_kwargs_to_add_to_suffix["data_dir"] is None:
172 config_kwargs_to_add_to_suffix.pop("data_dir", None)
173 else:
174 # canonicalize the data dir to avoid two paths to the same location having different
175 # hashes
176 data_dir = config_kwargs_to_add_to_suffix["data_dir"]
177 data_dir = os.path.normpath(data_dir)
178 config_kwargs_to_add_to_suffix["data_dir"] = data_dir
179 if config_kwargs_to_add_to_suffix:
180 # we don't care about the order of the kwargs
181 config_kwargs_to_add_to_suffix = {
182 k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
183 }
184 if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
185 suffix = ",".join(
186 str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
187 )
188 if len(suffix) > 32: # hash if too long
189 suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
190 else:
191 suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
192
193 if custom_features is not None:
194 m = Hasher()
195 if suffix:
196 m.update(suffix)
197 m.update(custom_features)
198 suffix = m.hexdigest()
199
200 if suffix:
201 config_id = self.name + "-" + suffix

Callers 1

Calls 6

update — Method · 0.95
hexdigest — Method · 0.95
Hasher — Class · 0.85
items — Method · 0.80
hash — Method · 0.80
copy — Method · 0.45

Tested by

no test coverage detected