Rearrange DataFrame into new partitions. Uses hashing of `on` to map rows to output partitions. After this operation, rows with the same value of `on` will be in the same partition. Parameters ---------- on : str, list of str, or Series, Index, or DataFrame
(
self,
on: str | list | no_default = no_default, # type: ignore[valid-type]
ignore_index: bool = False,
npartitions: int | None = None,
shuffle_method: str | None = None,
on_index: bool = False,
force: bool = False,
**options,
)
| 842 | return self.partitions[n] |
| 843 | |
| 844 | def shuffle( |
| 845 | self, |
| 846 | on: str | list | no_default = no_default, # type: ignore[valid-type] |
| 847 | ignore_index: bool = False, |
| 848 | npartitions: int | None = None, |
| 849 | shuffle_method: str | None = None, |
| 850 | on_index: bool = False, |
| 851 | force: bool = False, |
| 852 | **options, |
| 853 | ): |
| 854 | """Rearrange DataFrame into new partitions |
| 855 | |
| 856 | Uses hashing of `on` to map rows to output partitions. After this |
| 857 | operation, rows with the same value of `on` will be in the same |
| 858 | partition. |
| 859 | |
| 860 | Parameters |
| 861 | ---------- |
| 862 | on : str, list of str, or Series, Index, or DataFrame |
| 863 | Column names to shuffle by. |
| 864 | ignore_index : optional |
| 865 | Whether to ignore the index. Default is ``False``. |
| 866 | npartitions : optional |
| 867 | Number of output partitions. The partition count will |
| 868 | be preserved by default. |
| 869 | shuffle_method : optional |
| 870 | Desired shuffle method. Default chosen at optimization time. |
| 871 | on_index : bool, default False |
| 872 | Whether to shuffle on the index. Mutually exclusive with 'on'. |
| 873 | Set this to ``True`` if 'on' is not provided. |
| 874 | force : bool, default False |
| 875 | This forces the optimizer to keep the shuffle even if the final |
| 876 | expression could be further simplified. |
| 877 | **options : optional |
| 878 | Algorithm-specific options. |
| 879 | |
| 880 | Notes |
| 881 | ----- |
| 882 | This does not preserve a meaningful index/partitioning scheme. This |
| 883 | is not deterministic if done in parallel. |
| 884 | |
| 885 | Examples |
| 886 | -------- |
| 887 | >>> df = df.shuffle(df.columns[0]) # doctest: +SKIP |
| 888 | """ |
| 889 | if on is no_default and not on_index: # type: ignore[unreachable] |
| 890 | raise TypeError( |
| 891 | "Must shuffle on either columns or the index; currently shuffling on " |
| 892 | "neither. Pass column(s) to 'on' or set 'on_index' to True." |
| 893 | ) |
| 894 | elif on is not no_default and on_index: |
| 895 | raise TypeError( |
| 896 | "Cannot shuffle on both columns and the index. Do not pass column(s) " |
| 897 | "to 'on' or set 'on_index' to False." |
| 898 | ) |
| 899 | |
| 900 | # Preserve partition count by default |
| 901 | npartitions = npartitions or self.npartitions |