MCPcopy
hub / github.com/MaartenGr/BERTopic / delete_topics

Method delete_topics

bertopic/_bertopic.py:2177–2311  ·  view source on GitHub ↗

Delete topics from the topic model. Deleted topics are mapped to -1 (the outlier topic), and core topic attributes such as topic embeddings and c-TF-IDF are updated automatically. Arguments — topics_to_delete: list of topics to delete.

(
        self,
        topics_to_delete: List[int],
    )

Source from the content-addressed store, hash-verified

2175 self.probabilities_ = self._map_probabilities(self.probabilities_)
2176
2177 def delete_topics(
2178 self,
2179 topics_to_delete: List[int],
2180 ) -> None:
2181 """Delete topics from the topic model.
2182
2183 The deleted topics will be mapped to -1 (outlier topic). Core topic attributes
2184 like topic embeddings and c-TF-IDF will be automatically updated.
2185
2186 Arguments:
2187 topics_to_delete: List of topics to delete
2188 """
2189 check_is_fitted(self)
2190
2191 topics_df = pd.DataFrame({"Topic": self.topics_})
2192
2193 # Check if -1 exists in the current topics
2194 had_outliers = -1 in set(self.topics_)
2195
2196 # If adding -1 for the first time, initialize its attributes
2197 if not had_outliers and any(topic in topics_to_delete for topic in self.topics_):
2198 # Initialize c-TF-IDF for -1 topic (zeros)
2199 outlier_row = np.zeros((1, self.c_tf_idf_.shape[1]))
2200 outlier_row = sp.csr_matrix(outlier_row)
2201 self.c_tf_idf_ = sp.vstack([outlier_row, self.c_tf_idf_])
2202
2203 # Initialize topic embeddings for -1 topic (zeros)
2204 outlier_embedding = np.zeros((1, self.topic_embeddings_.shape[1]))
2205 self.topic_embeddings_ = np.vstack([outlier_embedding, self.topic_embeddings_])
2206
2207 # Initialize topic representations for -1 topic: ("", 1e-05)
2208 self.topic_representations_[-1] = [("", 1e-05)]
2209
2210 # Initialize representative docs for -1 topic (empty list)
2211 self.representative_docs_[-1] = []
2212
2213 # Initialize representative images for -1 topic if images are being used
2214 if self.representative_images_ is not None:
2215 outlier_image = np.zeros((1, self.representative_images_.shape[1]))
2216 self.representative_images_ = np.vstack([outlier_image, self.representative_images_])
2217
2218 # Initialize custom labels for -1 topic if they exist
2219 if hasattr(self, "custom_labels_") and self.custom_labels_ is not None:
2220 self.custom_labels_[-1] = ""
2221
2222 # Initialize ctfidf model diagonal for -1 topic (ones) if it exists
2223 if hasattr(self, "ctfidf_model") and self.ctfidf_model is not None:
2224 n_features = self.ctfidf_model._idf_diag.shape[1]
2225 outlier_diag = sp.csr_matrix(([1.0], ([0], [0])), shape=(1, n_features))
2226 self.ctfidf_model._idf_diag = sp.vstack([outlier_diag, self.ctfidf_model._idf_diag])
2227
2228 # Initialize topic aspects for -1 topic (empty dict for each aspect) if they exist
2229 if hasattr(self, "topic_aspects_") and self.topic_aspects_ is not None:
2230 for aspect in self.topic_aspects_:
2231 self.topic_aspects_[aspect][-1] = {}
2232
2233 # First map deleted topics to -1
2234 mapping = {topic: -1 if topic in topics_to_delete else topic for topic in set(self.topics_)}

Callers 1

test_deleteFunction · 0.80

Calls 6

_update_topic_sizeMethod · 0.95
_map_probabilitiesMethod · 0.95
check_is_fittedFunction · 0.90
add_mappingsMethod · 0.80
get_mappingsMethod · 0.80

Tested by 1

test_deleteFunction · 0.64