Cluster UMAP reduced embeddings with HDBSCAN.

Arguments:
- umap_embeddings: The reduced sentence embeddings with UMAP
- documents: Dataframe with documents and their corresponding IDs
- partial_fit: Whether to run `partial_fit` for online learning
- y: Array of topics to use
(
self,
umap_embeddings: np.ndarray,
documents: pd.DataFrame,
partial_fit: bool = False,
y: np.ndarray = None,
)
| 3956 | return np.nan_to_num(umap_embeddings) |
| 3957 | |
def _cluster_embeddings(
    self,
    umap_embeddings: np.ndarray,
    documents: pd.DataFrame,
    partial_fit: bool = False,
    y: np.ndarray = None,
) -> Tuple[pd.DataFrame, np.ndarray]:
    """Assign a topic to every document by clustering its reduced embedding.

    The clustering model stored on `self.hdbscan_model` is fitted on the
    embeddings and the resulting labels are written to the "Topic" column
    of `documents`.

    Arguments:
        umap_embeddings: The reduced sentence embeddings with UMAP
        documents: Dataframe with documents and their corresponding IDs;
                   its "Topic" column is added or overwritten in place
        partial_fit: Whether to update the model through `partial_fit`
                     (online learning) instead of a regular `fit`
        y: Array of topics to use; passed to `fit` and used as the labels
           when the fitted model exposes no `labels_` attribute

    Returns:
        documents: The same dataframe, now carrying the assigned topics
        probabilities: The model's `probabilities_` attribute when it has
                       one, replaced by the result of the
                       `all_points_membership_vectors` delegate when
                       `calculate_probabilities` is set and the model is
                       supported; None otherwise
    """
    logger.info("Cluster - Start clustering the reduced embeddings")

    if not partial_fit:
        # Retry without `y` when the call raises TypeError (presumably for
        # models whose `fit` does not accept a target -- TODO confirm).
        try:
            self.hdbscan_model.fit(umap_embeddings, y=y)
        except TypeError:
            self.hdbscan_model.fit(umap_embeddings)

        # Models without `labels_` fall back to the supplied topics.
        documents["Topic"] = getattr(self.hdbscan_model, "labels_", y)
        # NOTE(review): `self.topics_` is read below but not assigned in this
        # branch; presumably `_update_topic_size` sets it -- confirm.
        self._update_topic_size(documents)
    else:
        # `partial_fit` returns the model, which replaces the stored one.
        self.hdbscan_model = self.hdbscan_model.partial_fit(umap_embeddings)
        online_labels = self.hdbscan_model.labels_
        documents["Topic"] = online_labels
        self.topics_ = online_labels

    # Extract probabilities
    membership = getattr(self.hdbscan_model, "probabilities_", None)
    wants_full_vectors = self.calculate_probabilities and is_supported_hdbscan(self.hdbscan_model)
    if wants_full_vectors:
        membership = hdbscan_delegator(self.hdbscan_model, "all_points_membership_vectors")

    # The topic mapper is only (re)built after a regular fit.
    if not partial_fit:
        self.topic_mapper_ = TopicMapper(self.topics_)

    logger.info("Cluster - Completed \u2713")
    return documents, membership
| 4009 | |
| 4010 | def _zeroshot_topic_modeling( |
| 4011 | self, documents: pd.DataFrame, embeddings: np.ndarray |