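Writes the dataset to `WebDataset <https://github.com/webdataset/webdataset>`_ files. The `TFRecord <https://www.tensorflow.org/tutorials/load_data/tfrecord>`_ files will contain `tf.train.Example <https://www.tensorflow.org/api_docs/python/tf/train/Example>`_ records, with one Example record for each row in the dataset.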
(
self,
path: str,
*,
filesystem: Optional["pyarrow.fs.FileSystem"] = None,
try_create_dir: bool = True,
arrow_open_stream_args: Optional[Dict[str, Any]] = None,
filename_provider: Optional[FilenameProvider] = None,
min_rows_per_file: Optional[int] = None,
ray_remote_args: Dict[str, Any] = None,
encoder: Optional[Union[bool, str, callable, list]] = True,
concurrency: Optional[int] = None,
num_rows_per_file: Optional[int] = None,
mode: SaveMode = SaveMode.APPEND,
)
| 4988 | @ConsumptionAPI |
| 4989 | @PublicAPI(stability="alpha", api_group=IOC_API_GROUP) |
| 4990 | def write_webdataset( |
| 4991 | self, |
| 4992 | path: str, |
| 4993 | *, |
| 4994 | filesystem: Optional["pyarrow.fs.FileSystem"] = None, |
| 4995 | try_create_dir: bool = True, |
| 4996 | arrow_open_stream_args: Optional[Dict[str, Any]] = None, |
| 4997 | filename_provider: Optional[FilenameProvider] = None, |
| 4998 | min_rows_per_file: Optional[int] = None, |
| 4999 | ray_remote_args: Dict[str, Any] = None, |
| 5000 | encoder: Optional[Union[bool, str, callable, list]] = True, |
| 5001 | concurrency: Optional[int] = None, |
| 5002 | num_rows_per_file: Optional[int] = None, |
| 5003 | mode: SaveMode = SaveMode.APPEND, |
| 5004 | ) -> None: |
| 5005 | """Writes the dataset to `WebDataset <https://github.com/webdataset/webdataset>`_ files. |
| 5006 | |
| 5007 | The `TFRecord <https://www.tensorflow.org/tutorials/load_data/tfrecord>`_ |
| 5008 | files will contain |
| 5009 | `tf.train.Example <https://www.tensorflow.org/api_docs/python/tf/train/Example>`_ # noqa: E501 |
| 5010 | records, with one Example record for each row in the dataset. |
| 5011 | |
| 5012 | .. warning:: |
| 5013 | tf.train.Feature only natively stores ints, floats, and bytes, |
| 5014 | so this function only supports datasets with these data types, |
| 5015 | and will error if the dataset contains unsupported types. |
| 5016 | |
| 5017 | This is only supported for datasets convertible to Arrow records. |
| 5018 | To control the number of files, use :meth:`Dataset.repartition`. |
| 5019 | |
| 5020 | Unless a custom filename provider is given, the format of the output |
| 5021 | files is ``{uuid}_{block_idx}.tfrecords``, where ``uuid`` is a unique id |
| 5022 | for the dataset. |
| 5023 | |
| 5024 | Examples: |
| 5025 | |
| 5026 | .. testcode:: |
| 5027 | :skipif: True |
| 5028 | |
| 5029 | import ray |
| 5030 | |
| 5031 | ds = ray.data.range(100) |
| 5032 | ds.write_webdataset("s3://bucket/folder/") |
| 5033 | |
| 5034 | Time complexity: O(dataset size / parallelism) |
| 5035 | |
| 5036 | Args: |
| 5037 | path: The path to the destination root directory, where tfrecords |
| 5038 | files are written to. |
| 5039 | filesystem: The filesystem implementation to write to. |
| 5040 | try_create_dir: If ``True``, attempts to create all |
| 5041 | directories in the destination path. Does nothing if all directories |
| 5042 | already exist. Defaults to ``True``. |
| 5043 | arrow_open_stream_args: kwargs passed to |
| 5044 | ``pyarrow.fs.FileSystem.open_output_stream`` |
| 5045 | filename_provider: A :class:`~ray.data.datasource.FilenameProvider` |
| 5046 | implementation. Use this parameter to customize what your filenames |
| 5047 | look like. |