Creates an embedding per topic based on its topic representation. As a default, topic vectors (topic embeddings) are created by taking the average of all document embeddings within a topic. If topics are merged, then a weighted average of topic embeddings is taken based on the initial topic sizes.
(
self,
documents: pd.DataFrame = None,
embeddings: np.ndarray = None,
mappings=None,
)
| 4313 | return repr_docs_mappings, repr_docs, repr_docs_indices, repr_docs_ids |
| 4314 | |
| 4315 | def _create_topic_vectors( |
| 4316 | self, |
| 4317 | documents: pd.DataFrame = None, |
| 4318 | embeddings: np.ndarray = None, |
| 4319 | mappings=None, |
| 4320 | ): |
| 4321 | """Creates embeddings per topics based on their topic representation. |
| 4322 | |
| 4323 | As a default, topic vectors (topic embeddings) are created by taking |
| 4324 | the average of all document embeddings within a topic. If topics are |
| 4325 | merged, then a weighted average of topic embeddings is taken based on |
| 4326 | the initial topic sizes. |
| 4327 | |
| 4328 | For the `.partial_fit` and `.update_topics` method, the average |
| 4329 | of all document embeddings is not taken since those are not known. |
| 4330 | Instead, the weighted average of the embeddings of the top n words |
| 4331 | is taken for each topic. The weighting is done based on the c-TF-IDF |
| 4332 | score. This will put more emphasis to words that represent a topic best. |
| 4333 | """ |
| 4334 | # Topic embeddings based on input embeddings |
| 4335 | if embeddings is not None and documents is not None: |
| 4336 | topic_embeddings = [] |
| 4337 | topics = documents.sort_values("Topic").Topic.unique() |
| 4338 | topic_ids = documents["Topic"].values |
| 4339 | doc_ids = documents["ID"].values.astype(int) |
| 4340 | for topic in topics: |
| 4341 | mask = topic_ids == topic |
| 4342 | topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0)) |
| 4343 | self.topic_embeddings_ = np.array(topic_embeddings) |
| 4344 | |
| 4345 | # Topic embeddings when merging topics |
| 4346 | elif self.topic_embeddings_ is not None and mappings is not None: |
| 4347 | topic_embeddings_dict = {} |
| 4348 | for topic_to, topics_from in mappings.items(): |
| 4349 | topic_ids = topics_from["topics_from"] |
| 4350 | topic_sizes = topics_from["topic_sizes"] |
| 4351 | if topic_ids: |
| 4352 | embds = np.array(self.topic_embeddings_)[np.array(topic_ids) + self._outliers] |
| 4353 | topic_embedding = np.average(embds, axis=0, weights=topic_sizes) |
| 4354 | topic_embeddings_dict[topic_to] = topic_embedding |
| 4355 | |
| 4356 | # Re-order topic embeddings |
| 4357 | topics_to_map = { |
| 4358 | topic_mapping[0]: topic_mapping[1] for topic_mapping in np.array(self.topic_mapper_.mappings_)[:, -2:] |
| 4359 | } |
| 4360 | topic_embeddings = {} |
| 4361 | for topic, embds in topic_embeddings_dict.items(): |
| 4362 | topic_embeddings[topics_to_map[topic]] = embds |
| 4363 | unique_topics = sorted(list(topic_embeddings.keys())) |
| 4364 | self.topic_embeddings_ = np.array([topic_embeddings[topic] for topic in unique_topics]) |
| 4365 | |
| 4366 | # Topic embeddings based on keyword representations |
| 4367 | elif self.embedding_model is not None and type(self.embedding_model) is not BaseEmbedder: |
| 4368 | topic_list = list(self.topic_representations_.keys()) |
| 4369 | topic_list.sort() |
| 4370 | |
| 4371 | # Only extract top n words |
| 4372 | n = len(self.topic_representations_[topic_list[0]]) |
no test coverage detected