Method std

python/ray/data/grouped_data.py:555–604 · view source on GitHub ↗

r"""Compute grouped standard deviation aggregation. Examples: >>> import ray >>> ray.data.range(100).groupby("id").std(ddof=0) # doctest: +SKIP >>> ray.data.from_items([ # doctest: +SKIP ... {"A": i % 3, "B": i, "C": i**2} # doctest: +SKIP

(
        self,
        on: Union[str, List[str]] = None,
        ddof: int = 1,
        ignore_nulls: bool = True,
    )

Source from the content-addressed store, hash-verified

553
554	@PublicAPI(api_group=CDS_API_GROUP)
def std(
    self,
    on: Union[str, List[str], None] = None,
    ddof: int = 1,
    ignore_nulls: bool = True,
) -> Dataset:
    r"""Compute grouped standard deviation aggregation.

    Examples:
        >>> import ray
        >>> ray.data.range(100).groupby("id").std(ddof=0) # doctest: +SKIP
        >>> ray.data.from_items([ # doctest: +SKIP
        ...     {"A": i % 3, "B": i, "C": i**2}
        ...     for i in range(100)
        ... ]).groupby("A").std(["B", "C"])

    NOTE: This uses Welford's online method for an accumulator-style
    computation of the standard deviation. This method was chosen due to
    its numerical stability, and it being computable in a single pass.
    This may give different (but more accurate) results than NumPy, Pandas,
    and sklearn, which use a less numerically stable two-pass algorithm.
    See
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm

    Args:
        on: a column name or a list of column names to aggregate. If
            ``None`` (the default), the std is computed column-wise for
            each original column in the dataset (see Returns).
        ddof: Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.
        ignore_nulls: Whether to ignore null values. If ``True``, null
            values will be ignored when computing the std; if ``False``,
            if a null value is encountered, the output will be null.
            We consider np.nan, None, and pd.NaT to be null values.
            Default is ``True``.

    Returns:
        The standard deviation result.

        For different values of ``on``, the return varies:

        - ``on=None``: a dataset containing a groupby key column,
          ``"k"``, and a column-wise std column for each original column in
          the dataset.
        - ``on=["col_1", ..., "col_n"]``: a dataset of ``n + 1``
          columns where the first column is the groupby key and the second
          through ``n + 1`` columns are the results of the aggregations.

        If groupby key is ``None`` then the key part of return is omitted.
    """
    # All of the work is delegated: ``Std`` is the aggregation handed to
    # ``_aggregate_on``, and ``ddof`` / ``ignore_nulls`` are forwarded as
    # keyword arguments unchanged.
    return self._aggregate_on(Std, on, ignore_nulls=ignore_nulls, ddof=ddof)
605
606
607	def _apply_udf_to_groups(

Callers

nothing calls this directly

Calls 1

_aggregate_onMethod · 0.95

Tested by

no test coverage detected