Creates a :class:`~ray.data.Dataset` from CSV files.

Examples:
    Read a file in remote storage.

    >>> import ray
    >>> ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
    >>> ds.schema()
    Column             Type
    ------             ----
(
paths: Union[str, List[str]],
*,
filesystem: Optional["pyarrow.fs.FileSystem"] = None,
parallelism: int = -1,
num_cpus: Optional[float] = None,
num_gpus: Optional[float] = None,
memory: Optional[float] = None,
ray_remote_args: Dict[str, Any] = None,
arrow_open_stream_args: Optional[Dict[str, Any]] = None,
partition_filter: Optional[PathPartitionFilter] = None,
partitioning: Partitioning = Partitioning("hive"),
include_paths: bool = False,
ignore_missing_paths: bool = False,
shuffle: Optional[Union[Literal["files"], FileShuffleConfig]] = None,
file_extensions: Optional[List[str]] = None,
concurrency: Optional[int] = None,
override_num_blocks: Optional[int] = None,
**arrow_csv_args,
)
| 1784 | |
| 1785 | @PublicAPI |
| 1786 | def read_csv( |
| 1787 | paths: Union[str, List[str]], |
| 1788 | *, |
| 1789 | filesystem: Optional["pyarrow.fs.FileSystem"] = None, |
| 1790 | parallelism: int = -1, |
| 1791 | num_cpus: Optional[float] = None, |
| 1792 | num_gpus: Optional[float] = None, |
| 1793 | memory: Optional[float] = None, |
| 1794 | ray_remote_args: Dict[str, Any] = None, |
| 1795 | arrow_open_stream_args: Optional[Dict[str, Any]] = None, |
| 1796 | partition_filter: Optional[PathPartitionFilter] = None, |
| 1797 | partitioning: Partitioning = Partitioning("hive"), |
| 1798 | include_paths: bool = False, |
| 1799 | ignore_missing_paths: bool = False, |
| 1800 | shuffle: Optional[Union[Literal["files"], FileShuffleConfig]] = None, |
| 1801 | file_extensions: Optional[List[str]] = None, |
| 1802 | concurrency: Optional[int] = None, |
| 1803 | override_num_blocks: Optional[int] = None, |
| 1804 | **arrow_csv_args, |
| 1805 | ) -> Dataset: |
| 1806 | """Creates a :class:`~ray.data.Dataset` from CSV files. |
| 1807 | |
| 1808 | Examples: |
| 1809 | Read a file in remote storage. |
| 1810 | |
| 1811 | >>> import ray |
| 1812 | >>> ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv") |
| 1813 | >>> ds.schema() |
| 1814 | Column Type |
| 1815 | ------ ---- |
| 1816 | sepal length (cm) double |
| 1817 | sepal width (cm) double |
| 1818 | petal length (cm) double |
| 1819 | petal width (cm) double |
| 1820 | target int64 |
| 1821 | |
| 1822 | Read multiple local files. |
| 1823 | |
| 1824 | >>> ray.data.read_csv( # doctest: +SKIP |
| 1825 | ... ["local:///path/to/file1", "local:///path/to/file2"]) |
| 1826 | |
| 1827 | Read a directory from remote storage. |
| 1828 | |
| 1829 | >>> ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris-csv/") |
| 1830 | |
| 1831 | Read files that use a different delimiter. For more uses of ParseOptions see |
| 1832 | https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html # noqa: E501 |
| 1833 | |
| 1834 | >>> from pyarrow import csv |
| 1835 | >>> parse_options = csv.ParseOptions(delimiter="\\t") |
| 1836 | >>> ds = ray.data.read_csv( |
| 1837 | ... "s3://anonymous@ray-example-data/iris.tsv", |
| 1838 | ... parse_options=parse_options) |
| 1839 | >>> ds.schema() |
| 1840 | Column Type |
| 1841 | ------ ---- |
| 1842 | sepal.length double |
| 1843 | sepal.width double |
searching dependent graphs…