MCPcopy
hub / github.com/MaartenGr/BERTopic / _cluster_embeddings

Method _cluster_embeddings

bertopic/_bertopic.py:3958–4008  ·  view source on GitHub ↗

Cluster UMAP reduced embeddings with HDBSCAN. Arguments: `umap_embeddings`: the reduced sentence embeddings with UMAP; `documents`: dataframe with documents and their corresponding IDs; `partial_fit`: whether to run `partial_fit` for online learning; `y`: array of topics to use.

(
        self,
        umap_embeddings: np.ndarray,
        documents: pd.DataFrame,
        partial_fit: bool = False,
        y: np.ndarray = None,
    )

Source from the content-addressed store, hash-verified

3956 return np.nan_to_num(umap_embeddings)
3957
def _cluster_embeddings(
    self,
    umap_embeddings: np.ndarray,
    documents: pd.DataFrame,
    partial_fit: bool = False,
    y: np.ndarray = None,
) -> Tuple[pd.DataFrame, np.ndarray]:
    """Assign a topic to every document by clustering its reduced embedding.

    The clustering model stored on `self.hdbscan_model` is either updated
    incrementally (`partial_fit=True`) or fitted from scratch. The resulting
    labels are written to the "Topic" column of `documents`, which is
    modified in place and also returned.

    Arguments:
        umap_embeddings: The dimensionality-reduced embeddings to cluster
        documents: Dataframe with documents and their corresponding IDs
        partial_fit: Whether to call the model's `partial_fit` (online
                     learning) instead of `fit`
        y: Array of topics; passed to `fit` when the model accepts it and
           used as the labels when the fitted model exposes no `labels_`

    Returns:
        documents: The same dataframe with the "Topic" column filled in
        probabilities: The model's `probabilities_`; or the result of
                       `all_points_membership_vectors` when
                       `calculate_probabilities` is set and the model is a
                       supported HDBSCAN; or None when the model has no
                       `probabilities_` attribute
    """
    logger.info("Cluster - Start clustering the reduced embeddings")

    if not partial_fit:
        clusterer = self.hdbscan_model
        # Not every clustering model accepts a `y` keyword in `fit`;
        # retry without it when the call is rejected with a TypeError.
        try:
            clusterer.fit(umap_embeddings, y=y)
        except TypeError:
            clusterer.fit(umap_embeddings)

        # Models without a `labels_` attribute fall back to the given `y`.
        documents["Topic"] = getattr(clusterer, "labels_", y)
        self._update_topic_size(documents)
    else:
        # Online learning: keep whatever `partial_fit` hands back as the model.
        self.hdbscan_model = self.hdbscan_model.partial_fit(umap_embeddings)
        online_labels = self.hdbscan_model.labels_
        documents["Topic"] = online_labels
        self.topics_ = online_labels

    # Extract probabilities
    clusterer = self.hdbscan_model
    try:
        probabilities = clusterer.probabilities_
    except AttributeError:
        probabilities = None
    else:
        # Full membership vectors are only requested for supported HDBSCAN
        # models, and only when the user asked for them.
        if self.calculate_probabilities and is_supported_hdbscan(clusterer):
            probabilities = hdbscan_delegator(clusterer, "all_points_membership_vectors")

    if not partial_fit:
        # NOTE(review): `self.topics_` is not assigned on this path within
        # this method; presumably `_update_topic_size` sets it - confirm.
        self.topic_mapper_ = TopicMapper(self.topics_)

    logger.info("Cluster - Completed \u2713")
    return documents, probabilities
4009
4010 def _zeroshot_topic_modeling(
4011 self, documents: pd.DataFrame, embeddings: np.ndarray

Callers 4

fit_transformMethod · 0.95
partial_fitMethod · 0.95

Calls 7

_update_topic_sizeMethod · 0.95
is_supported_hdbscanFunction · 0.90
hdbscan_delegatorFunction · 0.90
TopicMapperClass · 0.85
infoMethod · 0.80
partial_fitMethod · 0.45
fitMethod · 0.45