hub / github.com/MaartenGr/BERTopic / _auto_reduce_topics

Method _auto_reduce_topics

bertopic/_bertopic.py:4655–4724 · view source on GitHub ↗

Reduce the number of topics automatically using HDBSCAN. Arguments: documents: Dataframe with documents and their corresponding IDs and Topics. use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the embeddings from the embedding model are used.

(self, documents: pd.DataFrame, use_ctfidf: bool = False)

Source from the content-addressed store, hash-verified

4653	return documents
4654
4655	def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
4656	"""Reduce the number of topics automatically using HDBSCAN.
4657
4658	Arguments:
4659	documents: Dataframe with documents and their corresponding IDs and Topics
4660	use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, the
4661	embeddings from the embedding model are used.
4662
4663	Returns:
4664	documents: Updated dataframe with documents and the reduced number of Topics
4665	"""
4666	topics = documents.Topic.tolist().copy()
4667	unique_topics = sorted(list(documents.Topic.unique()))[self._outliers :]
4668
4669	# Find similar topics
4670	embeddings = select_topic_representation(
4671	self.c_tf_idf_, self.topic_embeddings_, use_ctfidf, output_ndarray=True
4672	)[0]
4673	norm_data = normalize(embeddings, norm="l2")
4674
4675	if HAS_HDBSCAN:
4676	predictions = HDBSCAN(
4677	min_cluster_size=2,
4678	metric="euclidean",
4679	cluster_selection_method="eom",
4680	prediction_data=True,
4681	).fit_predict(norm_data[self._outliers :])
4682	else:
4683	predictions = SK_HDBSCAN(
4684	min_cluster_size=2, metric="euclidean", cluster_selection_method="eom", n_jobs=-1
4685	).fit_predict(norm_data[self._outliers :])
4686
4687	# Map clusters to their lowest topic_id
4688	cluster_to_lowest = {}
4689	for cluster, topic_id in zip(predictions, unique_topics):
4690	if cluster != -1: # Ignore unclustered items
4691	if cluster not in cluster_to_lowest:
4692	cluster_to_lowest[cluster] = topic_id
4693	else:
4694	cluster_to_lowest[cluster] = min(cluster_to_lowest[cluster], topic_id)
4695
4696	# Map each topic_id to the lowest topic_id in its cluster
4697	mapped_topics = {}
4698	for cluster, topic_id in zip(predictions, unique_topics):
4699	if cluster == -1:
4700	mapped_topics[topic_id] = topic_id # No clustering, stays the same
4701	else:
4702	mapped_topics[topic_id] = cluster_to_lowest[cluster]
4703
4704	documents.Topic = documents.Topic.map(mapped_topics).fillna(documents.Topic).astype(int)
4705	mapped_topics = {from_topic: to_topic for from_topic, to_topic in zip(topics, documents.Topic.tolist())}
4706
4707	# Track mappings and sizes of topics for merging topic embeddings
4708	mappings = defaultdict(list)
4709	for key, val in sorted(mapped_topics.items()):
4710	mappings[val].append(key)
4711	mappings = {
4712	topic_to: {

Callers 1

_reduce_topicsMethod · 0.95

Calls 5

_sort_mappings_by_frequencyMethod · 0.95

_extract_topicsMethod · 0.95

_update_topic_sizeMethod · 0.95

select_topic_representationFunction · 0.90

add_mappingsMethod · 0.80

Tested by

no test coverage detected