MCPcopy
hub / github.com/MaartenGr/BERTopic / _create_topic_vectors

Method _create_topic_vectors

bertopic/_bertopic.py:4315–4396  ·  view source on GitHub ↗

Creates embeddings per topics based on their topic representation. As a default, topic vectors (topic embeddings) are created by taking the average of all document embeddings within a topic. If topics are merged, then a weighted average of topic embeddings is taken based on the initial topic sizes.

(
        self,
        documents: pd.DataFrame = None,
        embeddings: np.ndarray = None,
        mappings=None,
    )

Source from the content-addressed store, hash-verified

4313 return repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids
4314
4315 def _create_topic_vectors(
4316 self,
4317 documents: pd.DataFrame = None,
4318 embeddings: np.ndarray = None,
4319 mappings=None,
4320 ):
4321 """Creates embeddings per topics based on their topic representation.
4322
4323 As a default, topic vectors (topic embeddings) are created by taking
4324 the average of all document embeddings within a topic. If topics are
4325 merged, then a weighted average of topic embeddings is taken based on
4326 the initial topic sizes.
4327
4328 For the `.partial_fit` and `.update_topics` method, the average
4329 of all document embeddings is not taken since those are not known.
4330 Instead, the weighted average of the embeddings of the top n words
4331 is taken for each topic. The weighting is done based on the c-TF-IDF
4332 score. This will put more emphasis to words that represent a topic best.
4333 """
4334 # Topic embeddings based on input embeddings
4335 if embeddings is not None and documents is not None:
4336 topic_embeddings = []
4337 topics = documents.sort_values("Topic").Topic.unique()
4338 topic_ids = documents["Topic"].values
4339 doc_ids = documents["ID"].values.astype(int)
4340 for topic in topics:
4341 mask = topic_ids == topic
4342 topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0))
4343 self.topic_embeddings_ = np.array(topic_embeddings)
4344
4345 # Topic embeddings when merging topics
4346 elif self.topic_embeddings_ is not None and mappings is not None:
4347 topic_embeddings_dict = {}
4348 for topic_to, topics_from in mappings.items():
4349 topic_ids = topics_from["topics_from"]
4350 topic_sizes = topics_from["topic_sizes"]
4351 if topic_ids:
4352 embds = np.array(self.topic_embeddings_)[np.array(topic_ids) + self._outliers]
4353 topic_embedding = np.average(embds, axis=0, weights=topic_sizes)
4354 topic_embeddings_dict[topic_to] = topic_embedding
4355
4356 # Re-order topic embeddings
4357 topics_to_map = {
4358 topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:]
4359 }
4360 topic_embeddings = {}
4361 for topic, embds in topic_embeddings_dict.items():
4362 topic_embeddings[topics_to_map[topic]] = embds
4363 unique_topics = sorted(list(topic_embeddings.keys()))
4364 self.topic_embeddings_ = np.array([topic_embeddings[topic] for topic in unique_topics])
4365
4366 # Topic embeddings based on keyword representations
4367 elif self.embedding_model is not None and type(self.embedding_model) is not BaseEmbedder:
4368 topic_list = list(self.topic_representations_.keys())
4369 topic_list.sort()
4370
4371 # Only extract top n words
4372 n = len(self.topic_representations_[topic_list[0]])

Callers 4

fit_transform · Method · 0.95
partial_fit · Method · 0.95
update_topics · Method · 0.95
_extract_topics · Method · 0.95

Calls 2

get_topic · Method · 0.95
_extract_embeddings · Method · 0.95

Tested by

no test coverage detected