Create a new dataset sorted according to a single or multiple columns. Args: column_names (`Union[str, Sequence[str]]`): Column name(s) to sort by. reverse (`Union[bool, Sequence[bool]]`, defaults to `False`): If `True`, sort by descending order rather than ascending.
(
self,
column_names: Union[str, Sequence_[str]],
reverse: Union[bool, Sequence_[bool]] = False,
null_placement: str = "at_end",
keep_in_memory: bool = False,
load_from_cache_file: Optional[bool] = None,
indices_cache_file_name: Optional[str] = None,
writer_batch_size: Optional[int] = 1000,
new_fingerprint: Optional[str] = None,
)
| 4704 | @transmit_format |
| 4705 | @fingerprint_transform(inplace=False, ignore_kwargs=["load_from_cache_file", "indices_cache_file_name"]) |
| 4706 | def sort( |
| 4707 | self, |
| 4708 | column_names: Union[str, Sequence_[str]], |
| 4709 | reverse: Union[bool, Sequence_[bool]] = False, |
| 4710 | null_placement: str = "at_end", |
| 4711 | keep_in_memory: bool = False, |
| 4712 | load_from_cache_file: Optional[bool] = None, |
| 4713 | indices_cache_file_name: Optional[str] = None, |
| 4714 | writer_batch_size: Optional[int] = 1000, |
| 4715 | new_fingerprint: Optional[str] = None, |
| 4716 | ) -> "Dataset": |
| 4717 | """Create a new dataset sorted according to a single or multiple columns. |
| 4718 | |
| 4719 | Args: |
| 4720 | column_names (`Union[str, Sequence[str]]`): |
| 4721 | Column name(s) to sort by. |
| 4722 | reverse (`Union[bool, Sequence[bool]]`, defaults to `False`): |
| 4723 | If `True`, sort by descending order rather than ascending. If a single bool is provided, |
| 4724 | the value is applied to the sorting of all column names. Otherwise a list of bools with the |
| 4725 | same length and order as column_names must be provided. |
| 4726 | null_placement (`str`, defaults to `at_end`): |
| 4727 | Put `None` values at the beginning if `at_start` or `first`, or at the end if `at_end` or `last`. |
| 4728 | |
| 4729 | <Added version="1.14.2"/> |
| 4730 | keep_in_memory (`bool`, defaults to `False`): |
| 4731 | Keep the sorted indices in memory instead of writing them to a cache file. |
| 4732 | load_from_cache_file (`Optional[bool]`, defaults to `True` if caching is enabled): |
| 4733 | If a cache file storing the sorted indices |
| 4734 | can be identified, use it instead of recomputing. |
| 4735 | indices_cache_file_name (`str`, *optional*, defaults to `None`): |
| 4736 | Provide the name of a path for the cache file. It is used to store the |
| 4737 | sorted indices instead of the automatically generated cache file name. |
| 4738 | writer_batch_size (`int`, defaults to `1000`): |
| 4739 | Number of rows per write operation for the cache file writer. |
| 4740 | A higher value gives smaller cache files; a lower value consumes less temporary memory. |
| 4741 | new_fingerprint (`str`, *optional*, defaults to `None`): |
| 4742 | The new fingerprint of the dataset after transform. |
| 4743 | If `None`, the new fingerprint is computed using a hash of the previous fingerprint and the transform arguments. |
| 4744 | |
| 4745 | Example: |
| 4746 | |
| 4747 | ```py |
| 4748 | >>> from datasets import load_dataset |
| 4749 | >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='validation') |
| 4750 | >>> ds['label'][:10] |
| 4751 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] |
| 4752 | >>> sorted_ds = ds.sort('label') |
| 4753 | >>> sorted_ds['label'][:10] |
| 4754 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] |
| 4755 | >>> another_sorted_ds = ds.sort(['label', 'text'], reverse=[True, False]) |
| 4756 | >>> another_sorted_ds['label'][:10] |
| 4757 | [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] |
| 4758 | ``` |
| 4759 | """ |
| 4760 | if len(self.list_indexes()) > 0: |
| 4761 | raise DatasetTransformationNotAllowedError( |
| 4762 | "Using `.sort` on a dataset with attached indexes is not allowed. You can first run `.drop_index() to remove your index and then re-add it." |
| 4763 | ) |