Apply Guided Topic Modeling. We transform the seeded topics to embeddings using the same embedder as used for generating document embeddings. Then, we apply cosine similarity between the embeddings and set labels for documents that are more similar to one of the topics than the average document.
(self, embeddings: np.ndarray)
| 4135 | return documents, embeddings |
| 4136 | |
def _guided_topic_modeling(self, embeddings: np.ndarray) -> Tuple[List[int], np.ndarray]:
    """Apply Guided Topic Modeling.

    We transform the seeded topics to embeddings using the
    same embedder as used for generating document embeddings.

    Then, we apply cosine similarity between the embeddings
    and set labels for documents that are more similar to
    one of the topics than the average document.

    If a document is more similar to the average document
    than any of the topics, it gets the -1 label and is
    thereby not included in UMAP.

    Note that `embeddings` is modified in place: the rows of all
    labeled documents are overwritten with a 75/25 blend of the
    document embedding and the embedding of its seeded topic. The
    same array object is returned.

    Arguments:
        embeddings: The document embeddings

    Returns:
        y: The label of each document, i.e. the index of its most
           similar seeded topic, or -1 if the average document is
           more similar than any seeded topic
        embeddings: Updated embeddings
    """
    logger.info("Guided - Find embeddings highly related to seeded topics.")
    # Create embeddings from the seeded topics. The mean document embedding
    # is appended as the last row and acts as the "average document" baseline.
    seed_topic_list = [" ".join(seed_topic) for seed_topic in self.seed_topic_list]
    n_seed_topics = len(seed_topic_list)
    seed_topic_embeddings = self._extract_embeddings(seed_topic_list, verbose=self.verbose)
    seed_topic_embeddings = np.vstack([seed_topic_embeddings, embeddings.mean(axis=0)])

    # Label documents that are most similar to one of the seeded topics.
    # Column `n_seed_topics` is the average document, so winning it means
    # "no seeded topic" and is mapped to -1.
    sim_matrix = cosine_similarity(embeddings, seed_topic_embeddings)
    labels = np.argmax(sim_matrix, axis=1)
    labels[labels == n_seed_topics] = -1

    # Average the document embeddings related to the seeded topics with the
    # embedding of the seeded topic to force the documents in a cluster
    for seed_topic in range(n_seed_topics):
        in_topic = labels == seed_topic
        embeddings[in_topic] = embeddings[in_topic] * 0.75 + seed_topic_embeddings[seed_topic] * 0.25
    logger.info("Guided - Completed \u2713")
    # tolist() yields plain Python ints, matching the List[int] annotation
    return labels.tolist(), embeddings
| 4176 | |
| 4177 | def _extract_topics( |
| 4178 | self, |
no test coverage detected