hub / github.com/MaartenGr/BERTopic / _create_topic_vectors

Method _create_topic_vectors

bertopic/_bertopic.py:4315–4396 · view source on GitHub ↗

Creates embeddings per topic based on their topic representation. As a default, topic vectors (topic embeddings) are created by taking the average of all document embeddings within a topic. If topics are merged, then a weighted average of topic embeddings is taken based on the initial topic sizes.

(
        self,
        documents: pd.DataFrame = None,
        embeddings: np.ndarray = None,
        mappings=None,
    )

Source from the content-addressed store, hash-verified

4313	return repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids
4314
4315	def _create_topic_vectors(
4316	self,
4317	documents: pd.DataFrame = None,
4318	embeddings: np.ndarray = None,
4319	mappings=None,
4320	):
4321	"""Creates embeddings per topics based on their topic representation.
4322
4323	As a default, topic vectors (topic embeddings) are created by taking
4324	the average of all document embeddings within a topic. If topics are
4325	merged, then a weighted average of topic embeddings is taken based on
4326	the initial topic sizes.
4327
4328	For the `.partial_fit` and `.update_topics` method, the average
4329	of all document embeddings is not taken since those are not known.
4330	Instead, the weighted average of the embeddings of the top n words
4331	is taken for each topic. The weighting is done based on the c-TF-IDF
4332	score. This will put more emphasis to words that represent a topic best.
4333	"""
4334	# Topic embeddings based on input embeddings
4335	if embeddings is not None and documents is not None:
4336	topic_embeddings = []
4337	topics = documents.sort_values("Topic").Topic.unique()
4338	topic_ids = documents["Topic"].values
4339	doc_ids = documents["ID"].values.astype(int)
4340	for topic in topics:
4341	mask = topic_ids == topic
4342	topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0))
4343	self.topic_embeddings_ = np.array(topic_embeddings)
4344
4345	# Topic embeddings when merging topics
4346	elif self.topic_embeddings_ is not None and mappings is not None:
4347	topic_embeddings_dict = {}
4348	for topic_to, topics_from in mappings.items():
4349	topic_ids = topics_from["topics_from"]
4350	topic_sizes = topics_from["topic_sizes"]
4351	if topic_ids:
4352	embds = np.array(self.topic_embeddings_)[np.array(topic_ids) + self._outliers]
4353	topic_embedding = np.average(embds, axis=0, weights=topic_sizes)
4354	topic_embeddings_dict[topic_to] = topic_embedding
4355
4356	# Re-order topic embeddings
4357	topics_to_map = {
4358	topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]
4359	}
4360	topic_embeddings = {}
4361	for topic, embds in topic_embeddings_dict.items():
4362	topic_embeddings[topics_to_map[topic]] = embds
4363	unique_topics = sorted(list(topic_embeddings.keys()))
4364	self.topic_embeddings_ = np.array([topic_embeddings[topic] for topic in unique_topics])
4365
4366	# Topic embeddings based on keyword representations
4367	elif self.embedding_model is not None and type(self.embedding_model) is not BaseEmbedder:
4368	topic_list = list(self.topic_representations_.keys())
4369	topic_list.sort()
4370
4371	# Only extract top n words
4372	n = len(self.topic_representations_[topic_list[0]])

Callers 4

fit_transformMethod · 0.95

partial_fitMethod · 0.95

update_topicsMethod · 0.95

_extract_topicsMethod · 0.95

Calls 2

get_topicMethod · 0.95

_extract_embeddingsMethod · 0.95

Tested by

no test coverage detected