r"""Compute grouped standard deviation aggregation. Examples: >>> import ray >>> ray.data.range(100).groupby("id").std(ddof=0) # doctest: +SKIP >>> ray.data.from_items([ # doctest: +SKIP ... {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP
(
self,
on: Union[str, List[str]] = None,
ddof: int = 1,
ignore_nulls: bool = True,
)
| 553 | |
| 554 | @PublicAPI(api_group=CDS_API_GROUP) |
| 555 | def std( |
| 556 | self, |
| 557 | on: Union[str, List[str]] = None, |
| 558 | ddof: int = 1, |
| 559 | ignore_nulls: bool = True, |
| 560 | ) -> Dataset: |
| 561 | r"""Compute grouped standard deviation aggregation. |
| 562 | |
| 563 | Examples: |
| 564 | >>> import ray |
| 565 | >>> ray.data.range(100).groupby("id").std(ddof=0) # doctest: +SKIP |
| 566 | >>> ray.data.from_items([ # doctest: +SKIP |
| 567 | ... {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP |
| 568 | ... for i in range(100)]) # doctest: +SKIP |
| 569 | ... .groupby("A") # doctest: +SKIP |
| 570 | ... .std(["B", "C"]) # doctest: +SKIP |
| 571 | |
| 572 | NOTE: This uses Welford's online method for an accumulator-style |
| 573 | computation of the standard deviation. This method was chosen due to |
| 574 | its numerical stability, and it being computable in a single pass. |
| 575 | This may give different (but more accurate) results than NumPy, Pandas, |
| 576 | and sklearn, which use a less numerically stable two-pass algorithm. |
| 577 | See |
| 578 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm |
| 579 | |
| 580 | Args: |
| 581 | on: a column name or a list of column names to aggregate. |
| 582 | ddof: Delta Degrees of Freedom. The divisor used in calculations |
| 583 | is ``N - ddof``, where ``N`` represents the number of elements. |
| 584 | ignore_nulls: Whether to ignore null values. If ``True``, null |
| 585 | values will be ignored when computing the std; if ``False``, |
| 586 | the output will be null if any null value is encountered. |
| 587 | We consider np.nan, None, and pd.NaT to be null values. |
| 588 | Default is ``True``. |
| 589 | |
| 590 | Returns: |
| 591 | The standard deviation result. |
| 592 | |
| 593 | For different values of ``on``, the return varies: |
| 594 | |
| 595 | - ``on=None``: a dataset containing a groupby key column, |
| 596 | ``"k"``, and a column-wise std column for each original column in |
| 597 | the dataset. |
| 598 | - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1`` |
| 599 | columns where the first column is the groupby key and the second |
| 600 | through ``n + 1`` columns are the results of the aggregations. |
| 601 | |
| 602 | If the groupby key is ``None``, the key part of the return value is omitted. |
| 603 | """ |
| 604 | return self._aggregate_on(Std, on, ignore_nulls=ignore_nulls, ddof=ddof) |
| 605 | |
| 606 | |
| 607 | def _apply_udf_to_groups( |
nothing calls this directly
no test coverage detected