Sort the dataset by one or more columns.

Sorting a parallel dataset requires expensive shuffles and is generally not recommended. See ``set_index`` for implementation details.

Parameters
----------
by: str or list[str]
    Column(s) to sort by.
...
(
self,
by: str | list[str],
npartitions: int | None = None,
ascending: bool | list[bool] = True,
na_position: Literal["first", "last"] = "last",
partition_size: float = 128e6,
sort_function: Callable[[pd.DataFrame], pd.DataFrame] | None = None,
sort_function_kwargs: Mapping[str, Any] | None = None,
upsample: float = 1.0,
ignore_index: bool | None = False,
shuffle_method: str | None = None,
**options,
)
| 3532 | ) |
| 3533 | |
| 3534 | def sort_values( |
| 3535 | self, |
| 3536 | by: str | list[str], |
| 3537 | npartitions: int | None = None, |
| 3538 | ascending: bool | list[bool] = True, |
| 3539 | na_position: Literal["first", "last"] = "last", |
| 3540 | partition_size: float = 128e6, |
| 3541 | sort_function: Callable[[pd.DataFrame], pd.DataFrame] | None = None, |
| 3542 | sort_function_kwargs: Mapping[str, Any] | None = None, |
| 3543 | upsample: float = 1.0, |
| 3544 | ignore_index: bool | None = False, |
| 3545 | shuffle_method: str | None = None, |
| 3546 | **options, |
| 3547 | ): |
| 3548 | """Sort the dataset by a single column. |
| 3549 | |
| 3550 | Sorting a parallel dataset requires expensive shuffles and is generally |
| 3551 | not recommended. See ``set_index`` for implementation details. |
| 3552 | |
| 3553 | Parameters |
| 3554 | ---------- |
| 3555 | by: str or list[str] |
| 3556 | Column(s) to sort by. |
| 3557 | npartitions: int, None, or 'auto' |
| 3558 | The ideal number of output partitions. If None, use the same as |
| 3559 | the input. If 'auto' then decide by memory use. |
| 3560 | ascending: bool, optional |
| 3561 | Sort ascending vs. descending. |
| 3562 | Defaults to True. |
| 3563 | na_position: {'last', 'first'}, optional |
| 3564 | Puts NaNs at the beginning if 'first', puts NaN at the end if 'last'. |
| 3565 | Defaults to 'last'. |
| 3566 | sort_function: function, optional |
| 3567 | Sorting function to use when sorting underlying partitions. |
| 3568 | If None, defaults to ``M.sort_values`` (the partition library's |
| 3569 | implementation of ``sort_values``). |
| 3570 | sort_function_kwargs: dict, optional |
| 3571 | Additional keyword arguments to pass to the partition sorting function. |
| 3572 | By default, ``by``, ``ascending``, and ``na_position`` are provided. |
| 3573 | |
| 3574 | Examples |
| 3575 | -------- |
| 3576 | >>> df2 = df.sort_values('x') # doctest: +SKIP |
| 3577 | """ |
| 3578 | if na_position not in ("first", "last"): |
| 3579 | raise ValueError("na_position must be either 'first' or 'last'") |
| 3580 | if not isinstance(by, list): |
| 3581 | by = [by] |
| 3582 | if any(not isinstance(b, str) for b in by): |
| 3583 | raise NotImplementedError( |
| 3584 | "Dataframes only support sorting by named columns which must be passed as a " |
| 3585 | "string or a list of strings.\n" |
| 3586 | f"You passed {by}" |
| 3587 | ) |
| 3588 | |
| 3589 | if not isinstance(ascending, bool) and not len(ascending) == len(by): |
| 3590 | raise ValueError(f"Length of {ascending=} != length of {by=}") |
| 3591 |