MCPcopy
hub / github.com/MaartenGr/BERTopic / _auto_reduce_topics

Method _auto_reduce_topics

bertopic/_bertopic.py:4655–4724  ·  view source on GitHub ↗

Reduce the number of topics automatically using HDBSCAN. Arguments: `documents`: Dataframe with documents and their corresponding IDs and Topics; `use_ctfidf`: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used.

(self, documents: pd.DataFrame, use_ctfidf: bool = False)

Source from the content-addressed store, hash-verified

4653 return documents
4654
4655 def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
4656 """Reduce the number of topics automatically using HDBSCAN.
4657
4658 Arguments:
4659 documents: Dataframe with documents and their corresponding IDs and Topics
4660 use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the
4661 embeddings from the embedding model are used.
4662
4663 Returns:
4664 documents: Updated dataframe with documents and the reduced number of Topics
4665 """
4666 topics = documents.Topic.tolist().copy()
4667 unique_topics = sorted(list(documents.Topic.unique()))[self._outliers :]
4668
4669 # Find similar topics
4670 embeddings = select_topic_representation(
4671 self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True
4672 )[0]
4673 norm_data = normalize(embeddings, norm="l2")
4674
4675 if HAS_HDBSCAN:
4676 predictions = HDBSCAN(
4677 min_cluster_size=2,
4678 metric="euclidean",
4679 cluster_selection_method="eom",
4680 prediction_data=True,
4681 ).fit_predict(norm_data[self._outliers :])
4682 else:
4683 predictions = SK_HDBSCAN(
4684 min_cluster_size=2, metric="euclidean", cluster_selection_method="eom", n_jobs=-1
4685 ).fit_predict(norm_data[self._outliers :])
4686
4687 # Map clusters to their lowest topic_id
4688 cluster_to_lowest = {}
4689 for cluster, topic_id in zip(predictions, unique_topics):
4690 if cluster != -1: # Ignore unclustered items
4691 if cluster not in cluster_to_lowest:
4692 cluster_to_lowest[cluster] = topic_id
4693 else:
4694 cluster_to_lowest[cluster] = min(cluster_to_lowest[cluster], topic_id)
4695
4696 # Map each topic_id to the lowest topic_id in its cluster
4697 mapped_topics = {}
4698 for cluster, topic_id in zip(predictions, unique_topics):
4699 if cluster == -1:
4700 mapped_topics[topic_id] = topic_id # No clustering, stays the same
4701 else:
4702 mapped_topics[topic_id] = cluster_to_lowest[cluster]
4703
4704 documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int)
4705 mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())}
4706
4707 # Track mappings and sizes of topics for merging topic embeddings
4708 mappings = defaultdict(list)
4709 for key, val in sorted(mapped_topics.items()):
4710 mappings[val].append(key)
4711 mappings = {
4712 topic_to: {

Callers 1

_reduce_topicsMethod · 0.95

Calls 5

_extract_topicsMethod · 0.95
_update_topic_sizeMethod · 0.95
add_mappingsMethod · 0.80

Tested by

no test coverage detected