Embed data into an array's storage. For custom features like Audio or Image, it takes into account the "embed_storage" methods they define to embed external data (e.g. an image file) into an array. Args: array (`pa.Array`): The PyArrow
(
array: pa.Array,
feature: "FeatureType",
token_per_repo_id=None,
local_files: bool = True,
remote_files: bool = True,
)
| 2151 | |
| 2152 | @_wrap_for_chunked_arrays |
| 2153 | def embed_array_storage( |
| 2154 | array: pa.Array, |
| 2155 | feature: "FeatureType", |
| 2156 | token_per_repo_id=None, |
| 2157 | local_files: bool = True, |
| 2158 | remote_files: bool = True, |
| 2159 | ): |
| 2160 | """Embed data into an arrays's storage. |
| 2161 | For custom features like Audio or Image, it takes into account the "embed_storage" methods |
| 2162 | they define to embed external data (e.g. an image file) into an array. |
| 2163 | |
| 2164 | <Added version="2.4.0"/> |
| 2165 | |
| 2166 | Args: |
| 2167 | array (`pa.Array`): |
| 2168 | The PyArrow array in which to embed data. |
| 2169 | feature (`datasets.features.FeatureType`): |
| 2170 | Array features. |
| 2171 | local_files (`bool`, defaults to `True`) |
| 2172 | Whether to embed local files data in the array |
| 2173 | |
| 2174 | <Added version="4.8.5"/> |
| 2175 | remote_files (`bool`, defaults to `True`) |
| 2176 | Whether to embed remote files data in the array. |
| 2177 | E.g. files with paths that start with hf:// or https:// |
| 2178 | |
| 2179 | <Added version="4.8.5"/> |
| 2180 | |
| 2181 | Raises: |
| 2182 | `TypeError`: if the target type is not supported according, e.g. |
| 2183 | |
| 2184 | - if a field is missing |
| 2185 | |
| 2186 | Returns: |
| 2187 | array (`pyarrow.Array`): the casted array |
| 2188 | """ |
| 2189 | if not local_files and not remote_files: |
| 2190 | return array |
| 2191 | |
| 2192 | from .features import LargeList, List |
| 2193 | |
| 2194 | _e = partial( |
| 2195 | embed_array_storage, token_per_repo_id=token_per_repo_id, local_files=local_files, remote_files=remote_files |
| 2196 | ) |
| 2197 | |
| 2198 | if isinstance(array, pa.ExtensionArray): |
| 2199 | array = array.storage |
| 2200 | if hasattr(feature, "embed_storage"): |
| 2201 | return feature.embed_storage( |
| 2202 | array, token_per_repo_id=token_per_repo_id, local_files=local_files, remote_files=remote_files |
| 2203 | ) |
| 2204 | elif pa.types.is_struct(array.type): |
| 2205 | # feature must be a dict |
| 2206 | if isinstance(feature, dict): |
| 2207 | arrays = [_e(array.field(name), subfeature) for name, subfeature in feature.items()] |
| 2208 | return pa.StructArray.from_arrays(arrays, names=list(feature), mask=array.is_null()) |
| 2209 | elif pa.types.is_list(array.type): |
| 2210 | # feature must be either List(subfeature) |