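Add a new column to the dataset via an expression. This method allows you to add a new column to a dataset by applying an expression. The expression can be composed of existing columns, literals, and user-defined functions (UDFs). For callable class UDFs, Ray Data automatically uses actor semantics to maintain state across batches. You can customize the compute strategy to control parallelism and resource allocation.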
(
self,
column_name: str,
expr: Expr,
*,
compute: Optional[ComputeStrategy] = None,
**ray_remote_args,
)
| 871 | |
| 872 | @PublicAPI(api_group=EXPRESSION_API_GROUP, stability="alpha") |
| 873 | def with_column( |
| 874 | self, |
| 875 | column_name: str, |
| 876 | expr: Expr, |
| 877 | *, |
| 878 | compute: Optional[ComputeStrategy] = None, |
| 879 | **ray_remote_args, |
| 880 | ) -> "Dataset": |
| 881 | """ |
| 882 | Add a new column to the dataset via an expression. |
| 883 | |
| 884 | This method allows you to add a new column to a dataset by applying an |
| 885 | expression. The expression can be composed of existing columns, literals, |
| 886 | and user-defined functions (UDFs). |
| 887 | |
| 888 | For callable class UDFs, Ray Data automatically uses actor semantics to maintain |
| 889 | state across batches. You can customize the compute strategy to control parallelism |
| 890 | and resource allocation. |
| 891 | |
| 892 | Examples: |
| 893 | >>> import ray |
| 894 | >>> from ray.data.expressions import col |
| 895 | >>> ds = ray.data.range(100) |
| 896 | >>> # Add a new column 'id_2' by multiplying 'id' by 2. |
| 897 | >>> ds.with_column("id_2", col("id") * 2).show(2) |
| 898 | {'id': 0, 'id_2': 0} |
| 899 | {'id': 1, 'id_2': 2} |
| 900 | |
| 901 | >>> # Using a UDF with with_column |
| 902 | >>> from ray.data.datatype import DataType |
| 903 | >>> from ray.data.expressions import udf |
| 904 | >>> import pyarrow.compute as pc |
| 905 | >>> |
| 906 | >>> @udf(return_dtype=DataType.int32()) |
| 907 | ... def add_one(column): |
| 908 | ... return pc.add(column, 1) |
| 909 | >>> |
| 910 | >>> ds.with_column("id_plus_one", add_one(col("id"))).show(2) |
| 911 | {'id': 0, 'id_plus_one': 1} |
| 912 | {'id': 1, 'id_plus_one': 2} |
| 913 | |
| 914 | >>> # Using a callable class UDF (automatically uses actors) |
| 915 | >>> @udf(return_dtype=DataType.int32()) |
| 916 | ... class AddOffset: |
| 917 | ... def __init__(self, offset): |
| 918 | ... self.offset = offset |
| 919 | ... def __call__(self, x): |
| 920 | ... return pc.add(x, self.offset) |
| 921 | >>> |
| 922 | >>> add_five = AddOffset(5) |
| 923 | >>> ds.with_column("id_plus_five", add_five(col("id"))).show(2) |
| 924 | {'id': 0, 'id_plus_five': 5} |
| 925 | {'id': 1, 'id_plus_five': 6} |
| 926 | |
| 927 | Args: |
| 928 | column_name: The name of the new column. |
| 929 | expr: An expression that defines the new column values. |
| 930 | compute: The compute strategy to use for the projection operation. |