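Reorder mappings by their frequency. For example, if topic 88 was mapped to topic 5 and topic 5 turns out to be the largest topic, then topic 5 will be topic 0. The second largest will be topic 1, etc. If there are no mappings since no reduction of topics took place, then the topics will simply be ordered by their frequency and will get their topic ids based on that order. This means that -1 will remain the outlier class, and that the remaining topics receive ascending ids in order of descending frequency.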
(self, documents: pd.DataFrame)
| 4724 | return documents |
| 4725 | |
def _sort_mappings_by_frequency(self, documents: pd.DataFrame) -> pd.DataFrame:
    """Renumber topics so that topic ids follow topic size, largest first.

    After this call the most frequent non-outlier topic has id 0, the
    next most frequent has id 1, and so on. For example, if topic 88 was
    earlier mapped to topic 5 and topic 5 turns out to be the largest
    topic, then topic 5 becomes topic 0.

    If no reduction of topics took place, and therefore no mappings
    exist yet, the topics are ranked by frequency in exactly the same way.

    The outlier topic -1 always keeps its id; all other ids ascend as
    the topic frequency descends.

    The new numbering is passed to `self.topic_mapper_.add_mappings` and
    written back into `documents.Topic`. `self._update_topic_size` is
    called both before the ranking and after the documents are remapped.

    Arguments:
        documents: Dataframe with documents and their corresponding IDs and Topics

    Returns:
        documents: The same dataframe with its topic ids replaced by the
                   frequency-based ids (returned untouched on the first
                   pass of zero-shot topic modeling)
    """
    # The first pass of zero-shot topic modeling keeps its topic ids as they are
    zeroshot_count = len(self._topic_id_to_zeroshot_topic_idx)
    is_first_zeroshot_pass = self._is_zeroshot and not self.nr_topics and zeroshot_count > 0
    if is_first_zeroshot_pass:
        return documents

    # Rank the non-outlier topics from largest to smallest
    self._update_topic_size(documents)
    size_table = pd.DataFrame(self.topic_sizes_.items(), columns=["Old_Topic", "Size"])
    size_table = size_table.sort_values("Size", ascending=False)
    ranked_topics = size_table.loc[size_table.Old_Topic != -1, "Old_Topic"]

    # The outlier topic maps onto itself; every other topic gets its rank as its new id
    new_id_by_topic = {-1: -1}
    new_id_by_topic.update({old_id: new_id for new_id, old_id in enumerate(ranked_topics)})
    self.topic_mapper_.add_mappings(new_id_by_topic, topic_model=self)

    # Apply the new ids to the documents; a topic without a mapping keeps its current id
    remapped = documents.Topic.map(new_id_by_topic)
    documents.Topic = remapped.fillna(documents.Topic).astype(int)
    self._update_topic_size(documents)
    return documents
| 4766 | |
| 4767 | def _map_probabilities( |
| 4768 | self, probabilities: Union[np.ndarray, None], original_topics: bool = False |