hub / github.com/MaartenGr/BERTopic / _zeroshot_topic_modeling

Method _zeroshot_topic_modeling

bertopic/_bertopic.py:4010–4066 · view source on GitHub ↗

Find documents that could be assigned to one of the topics in `self.zeroshot_topic_list`. We transform the topics in `self.zeroshot_topic_list` to embeddings and compare them through cosine similarity with the document embeddings. If they pass the `self.zeroshot_min_similarity` threshold, they are assigned.

(
        self, documents: pd.DataFrame, embeddings: np.ndarray
    )

Source from the content-addressed store, hash-verified

4008	return documents, probabilities
4009
def _zeroshot_topic_modeling(
    self, documents: pd.DataFrame, embeddings: np.ndarray
) -> Tuple[pd.DataFrame, np.ndarray, pd.DataFrame, np.ndarray]:
    """Find documents that could be assigned to one of the topics in `self.zeroshot_topic_list`.

    We transform the topics in `self.zeroshot_topic_list` to embeddings and
    compare them through cosine similarity with the document embeddings.
    Each document is matched with its most similar zero-shot topic; if that
    similarity passes the `self.zeroshot_min_similarity` threshold, the
    document is assigned to the topic, otherwise it is left over.

    In both returned dataframes the incoming `ID` column is preserved as
    `Old_ID` and `ID` is renumbered from 0 so that it lines up row-by-row
    with the matching returned embeddings.

    If no documents are left over, `self.topics_` and `self.topic_mapper_`
    are set here from the zero-shot assignments.

    Arguments:
        documents: Dataframe with documents and their corresponding IDs
        embeddings: The document embeddings, one row per row of `documents`

    Returns:
        documents: The leftover documents that were not assigned to any topic
        embeddings: The leftover embeddings that were not assigned to any topic
        assigned_documents: The documents that were assigned to a zero-shot topic;
                            the `Topic` column holds the column index of the
                            best-matching zero-shot topic embedding
        assigned_embeddings: The embeddings of the assigned documents

    Raises:
        ValueError: If `self.nr_topics` is a number that does not exceed the
                    number of zero-shot topics that were matched.
    """
    logger.info("Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics")

    # Similarity between document and zero-shot topic embeddings
    zeroshot_embeddings = self._extract_embeddings(self.zeroshot_topic_list)
    cosine_similarities = cosine_similarity(embeddings, zeroshot_embeddings)
    assignment = np.argmax(cosine_similarities, 1)
    assignment_vals = np.max(cosine_similarities, 1)

    # Split the row positions with a single boolean mask. Using the complement
    # guarantees every document ends up in exactly one of the two partitions,
    # and `np.flatnonzero` always yields integer indices (also when empty).
    is_assigned = assignment_vals >= self.zeroshot_min_similarity
    assigned_ids = np.flatnonzero(is_assigned)
    non_assigned_ids = np.flatnonzero(~is_assigned)

    # Assign topics. Copy explicitly so that adding columns below operates on
    # an independent frame instead of a slice of the caller's dataframe.
    assigned_documents = documents.iloc[assigned_ids].copy()
    assigned_documents["Topic"] = assignment[assigned_ids]
    assigned_documents["Old_ID"] = assigned_documents["ID"].copy()
    assigned_documents["ID"] = range(len(assigned_documents))
    assigned_embeddings = embeddings[assigned_ids]

    # Check that if a number of topics was specified, it exceeds the number of zeroshot topics matched
    num_zeroshot_topics = assigned_documents["Topic"].nunique()
    if self.nr_topics != "auto" and self.nr_topics and self.nr_topics <= num_zeroshot_topics:
        raise ValueError(
            f"The set nr_topics ({self.nr_topics}) must exceed the number of matched zero-shot topics "
            f"({num_zeroshot_topics}). Consider raising nr_topics or raising the "
            f"zeroshot_min_similarity ({self.zeroshot_min_similarity})."
        )

    # Select non-assigned documents to be clustered
    documents = documents.iloc[non_assigned_ids].copy()
    documents["Old_ID"] = documents["ID"].copy()
    documents["ID"] = range(len(documents))
    embeddings = embeddings[non_assigned_ids]

    # Nothing is left to cluster: the zero-shot assignments are the final topics
    if len(documents) == 0:
        self.topics_ = assigned_documents["Topic"].values.tolist()
        self.topic_mapper_ = TopicMapper(self.topics_)

    logger.info("Zeroshot Step 1 - Completed \u2713")
    return documents, embeddings, assigned_documents, assigned_embeddings
4067

Callers 1

fit_transformMethod · 0.95

Calls 3

_extract_embeddingsMethod · 0.95

TopicMapperClass · 0.85

infoMethod · 0.80

Tested by

no test coverage detected