Reduce the number of topics automatically using HDBSCAN. Arguments: documents: Dataframe with documents and their corresponding IDs and Topics. use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used.
(self, documents: pd.DataFrame, use_ctfidf: bool = False)
| 4653 | return documents |
| 4654 | |
| 4655 | def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame: |
| 4656 | """Reduce the number of topics automatically using HDBSCAN. |
| 4657 | |
| 4658 | Arguments: |
| 4659 | documents: Dataframe with documents and their corresponding IDs and Topics |
| 4660 | use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the |
| 4661 | embeddings from the embedding model are used. |
| 4662 | |
| 4663 | Returns: |
| 4664 | documents: Updated dataframe with documents and the reduced number of Topics |
| 4665 | """ |
| 4666 | topics = documents.Topic.tolist().copy() |
| 4667 | unique_topics = sorted(list(documents.Topic.unique()))[self._outliers :] |
| 4668 | |
| 4669 | # Find similar topics |
| 4670 | embeddings = select_topic_representation( |
| 4671 | self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True |
| 4672 | )[0] |
| 4673 | norm_data = normalize(embeddings, norm="l2") |
| 4674 | |
| 4675 | if HAS_HDBSCAN: |
| 4676 | predictions = HDBSCAN( |
| 4677 | min_cluster_size=2, |
| 4678 | metric="euclidean", |
| 4679 | cluster_selection_method="eom", |
| 4680 | prediction_data=True, |
| 4681 | ).fit_predict(norm_data[self._outliers :]) |
| 4682 | else: |
| 4683 | predictions = SK_HDBSCAN( |
| 4684 | min_cluster_size=2, metric="euclidean", cluster_selection_method="eom", n_jobs=-1 |
| 4685 | ).fit_predict(norm_data[self._outliers :]) |
| 4686 | |
| 4687 | # Map clusters to their lowest topic_id |
| 4688 | cluster_to_lowest = {} |
| 4689 | for cluster, topic_id in zip(predictions, unique_topics): |
| 4690 | if cluster != -1: # Ignore unclustered items |
| 4691 | if cluster not in cluster_to_lowest: |
| 4692 | cluster_to_lowest[cluster] = topic_id |
| 4693 | else: |
| 4694 | cluster_to_lowest[cluster] = min(cluster_to_lowest[cluster], topic_id) |
| 4695 | |
| 4696 | # Map each topic_id to the lowest topic_id in its cluster |
| 4697 | mapped_topics = {} |
| 4698 | for cluster, topic_id in zip(predictions, unique_topics): |
| 4699 | if cluster == -1: |
| 4700 | mapped_topics[topic_id] = topic_id # No clustering, stays the same |
| 4701 | else: |
| 4702 | mapped_topics[topic_id] = cluster_to_lowest[cluster] |
| 4703 | |
| 4704 | documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int) |
| 4705 | mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())} |
| 4706 | |
| 4707 | # Track mappings and sizes of topics for merging topic embeddings |
| 4708 | mappings = defaultdict(list) |
| 4709 | for key, val in sorted(mapped_topics.items()): |
| 4710 | mappings[val].append(key) |
| 4711 | mappings = { |
| 4712 | topic_to: { |
no test coverage detected