Creates a :class:`~ray.data.Dataset` from parquet files.

Examples:
    Read a file in remote storage.

    >>> import ray
    >>> ds = ray.data.read_parquet("s3://anonymous@ray-example-data/iris.parquet")
    >>> ds.schema()
    Column Type
    ------ ----
(
paths: Union[str, List[str]],
*,
filesystem: Optional["pyarrow.fs.FileSystem"] = None,
columns: Optional[List[str]] = None,
parallelism: int = -1,
num_cpus: Optional[float] = None,
num_gpus: Optional[float] = None,
memory: Optional[float] = None,
ray_remote_args: Dict[str, Any] = None,
tensor_column_schema: Optional[TensorColumnSchema] = None,
partition_filter: Optional[PathPartitionFilter] = None,
partitioning: Optional[Partitioning] = Partitioning("hive"),
shuffle: Optional[Union[Literal["files"], FileShuffleConfig]] = None,
include_paths: bool = False,
include_row_hash: bool = False,
file_extensions: Optional[List[str]] = ParquetDatasource._FILE_EXTENSIONS,
concurrency: Optional[int] = None,
override_num_blocks: Optional[int] = None,
**arrow_parquet_args,
)
| 1126 | |
| 1127 | @PublicAPI |
| 1128 | def read_parquet( |
| 1129 | paths: Union[str, List[str]], |
| 1130 | *, |
| 1131 | filesystem: Optional["pyarrow.fs.FileSystem"] = None, |
| 1132 | columns: Optional[List[str]] = None, |
| 1133 | parallelism: int = -1, |
| 1134 | num_cpus: Optional[float] = None, |
| 1135 | num_gpus: Optional[float] = None, |
| 1136 | memory: Optional[float] = None, |
| 1137 | ray_remote_args: Dict[str, Any] = None, |
| 1138 | tensor_column_schema: Optional[TensorColumnSchema] = None, |
| 1139 | partition_filter: Optional[PathPartitionFilter] = None, |
| 1140 | partitioning: Optional[Partitioning] = Partitioning("hive"), |
| 1141 | shuffle: Optional[Union[Literal["files"], FileShuffleConfig]] = None, |
| 1142 | include_paths: bool = False, |
| 1143 | include_row_hash: bool = False, |
| 1144 | file_extensions: Optional[List[str]] = ParquetDatasource._FILE_EXTENSIONS, |
| 1145 | concurrency: Optional[int] = None, |
| 1146 | override_num_blocks: Optional[int] = None, |
| 1147 | **arrow_parquet_args, |
| 1148 | ) -> Dataset: |
| 1149 | """Creates a :class:`~ray.data.Dataset` from parquet files. |
| 1150 | |
| 1151 | |
| 1152 | Examples: |
| 1153 | Read a file in remote storage. |
| 1154 | |
| 1155 | >>> import ray |
| 1156 | >>> ds = ray.data.read_parquet("s3://anonymous@ray-example-data/iris.parquet") |
| 1157 | >>> ds.schema() |
| 1158 | Column Type |
| 1159 | ------ ---- |
| 1160 | sepal.length double |
| 1161 | sepal.width double |
| 1162 | petal.length double |
| 1163 | petal.width double |
| 1164 | variety string |
| 1165 | |
| 1166 | Read a directory in remote storage. |
| 1167 | |
| 1168 | >>> ds = ray.data.read_parquet("s3://anonymous@ray-example-data/iris-parquet/") |
| 1169 | |
| 1170 | Read multiple local files. |
| 1171 | |
| 1172 | >>> ray.data.read_parquet( |
| 1173 | ... ["local:///path/to/file1", "local:///path/to/file2"]) # doctest: +SKIP |
| 1174 | |
| 1175 | Specify a schema for the parquet file. |
| 1176 | |
| 1177 | >>> import pyarrow as pa |
| 1178 | >>> fields = [("sepal.length", pa.float32()), |
| 1179 | ... ("sepal.width", pa.float32()), |
| 1180 | ... ("petal.length", pa.float32()), |
| 1181 | ... ("petal.width", pa.float32()), |
| 1182 | ... ("variety", pa.string())] |
| 1183 | >>> ds = ray.data.read_parquet("s3://anonymous@ray-example-data/iris.parquet", |
| 1184 | ... schema=pa.schema(fields)) |
| 1185 | >>> ds.schema() |
no test coverage detected
searching dependent graphs…