Defines count aggregation. Example: .. testcode:: import ray from ray.data.aggregate import Count ds = ray.data.range(100) # Schema: {'id': int64} ds = ds.add_column( "group_key", lambda batch: batch["id"].as
| 390 | |
| 391 | @PublicAPI |
| 392 | class Count(AggregateFnV2[int, int]): |
| 393 | """Defines count aggregation. |
| 394 | |
| 395 | Example: |
| 396 | |
| 397 | .. testcode:: |
| 398 | |
| 399 | import ray |
| 400 | from ray.data.aggregate import Count |
| 401 | |
| 402 | ds = ray.data.range(100) |
| 403 | # Schema: {'id': int64} |
| 404 | ds = ds.add_column( |
| 405 | "group_key", lambda batch: batch["id"].astype("int64") % 3 |
| 406 | ) |
| 407 | # Schema: {'id': int64, 'group_key': int64} |
| 408 | |
| 409 | # Counting all rows: |
| 410 | result = ds.aggregate(Count()) |
| 411 | # result: {'count()': 100} |
| 412 | |
| 413 | |
| 414 | # Counting all rows per group: |
| 415 | result = ds.groupby("group_key").aggregate(Count(on="id")).take_all() |
| 416 | # result: [{'group_key': 0, 'count(id)': 34}, |
| 417 | # {'group_key': 1, 'count(id)': 33}, |
| 418 | # {'group_key': 2, 'count(id)': 33}] |
| 419 | |
| 420 | |
| 421 | Args: |
| 422 | on: Optional name of the column to count values on. If None, counts rows. |
| 423 | ignore_nulls: Whether to ignore null values when counting. Only applies if |
| 424 | `on` is specified. Defaults to `False`, which means `Count()` on a column |
| 425 | counts nulls. To match pandas' default behavior of not counting nulls, |
| 426 | set `ignore_nulls=True`. |
| 427 | alias_name: Optional name for the resulting column. |
| 428 | """ |
| 429 | |
| 430 | def __init__( |
| 431 | self, |
| 432 | on: Optional[str] = None, |
| 433 | ignore_nulls: bool = False, |
| 434 | alias_name: Optional[str] = None, |
| 435 | ): |
| 436 | super().__init__( |
| 437 | alias_name if alias_name else f"count({on or ''})", |
| 438 | on=on, |
| 439 | ignore_nulls=ignore_nulls, |
| 440 | zero_factory=lambda: 0, |
| 441 | ) |
| 442 | |
| 443 | def aggregate_block(self, block: Block) -> int: |
| 444 | block_accessor = BlockAccessor.for_block(block) |
| 445 | |
| 446 | if self._target_col_name is None: |
| 447 | # In case of global count, simply fetch number of rows |
| 448 | return block_accessor.num_rows() |
| 449 |
no outgoing calls
searching dependent graphs…