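Extract topics from the clusters using a class-based TF-IDF. Arguments: documents: DataFrame with documents and their corresponding IDs; embeddings: The document embeddings; mappings: The mappings from topic to word; verbose: Whether to log the process of extracting topics; fine_tune_representation: If True, the topic representation will be fine-tuned using representation models. If False, the topic representation will remain as the base c-TF-IDF representation.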
(
self,
documents: pd.DataFrame,
embeddings: np.ndarray = None,
mappings=None,
verbose: bool = False,
fine_tune_representation: bool = True,
)
| 4175 | return y, embeddings |
| 4176 | |
def _extract_topics(
    self,
    documents: pd.DataFrame,
    embeddings: np.ndarray = None,
    mappings=None,
    verbose: bool = False,
    fine_tune_representation: bool = True,
):
    """Compute the class-based TF-IDF matrix and derive topic representations.

    All documents assigned to the same topic are joined into one string, the
    c-TF-IDF matrix is computed over those per-topic strings, the words per
    topic are extracted from it, and finally the topic vectors are created.

    Arguments:
        documents: Dataframe with documents and their corresponding IDs.
                   It is grouped on its "Topic" column and the "Document"
                   column is joined with single spaces.
        embeddings: The document embeddings. Forwarded to the word
                    extraction and topic vector steps.
        mappings: The mappings from topic to word. Forwarded to
                  `_create_topic_vectors`.
        verbose: Whether to log the process of extracting topics
        fine_tune_representation: If True, the topic representation will be fine-tuned using representation models.
                                  If False, the topic representation will remain as the base c-TF-IDF representation.
                                  The same flag is also passed as `calculate_aspects`.

    Returns:
        Nothing. Results are stored on the instance: `c_tf_idf_` holds the
        c-TF-IDF result and `topic_representations_` holds the extracted
        words per topic.
    """
    if verbose:
        if fine_tune_representation:
            action, method = "Fine-tuning", "representation models"
        else:
            action, method = "Extracting", "c-TF-IDF for topic reduction"
        logger.info(f"Representation - {action} topics using {method}.")

    # Collapse the corpus to one row per topic, concatenating its documents.
    grouped = documents.groupby(["Topic"], as_index=False)
    joined_per_topic = grouped.agg({"Document": " ".join})

    # Assign c_tf_idf_ before extracting words, exactly as the original order does.
    c_tf_idf, vocabulary = self._c_tf_idf(joined_per_topic)
    self.c_tf_idf_ = c_tf_idf

    representations = self._extract_words_per_topic(
        vocabulary,
        documents,
        fine_tune_representation=fine_tune_representation,
        calculate_aspects=fine_tune_representation,
        embeddings=embeddings,
    )
    self.topic_representations_ = representations

    self._create_topic_vectors(
        documents=documents,
        embeddings=embeddings,
        mappings=mappings,
    )

    if verbose:
        logger.info("Representation - Completed \u2713")
| 4216 | |
| 4217 | def _save_representative_docs(self, documents: pd.DataFrame): |
| 4218 | """Save the 3 most representative docs per topic. |