MCPcopy
hub / github.com/MaartenGr/BERTopic / _extract_topics

Method _extract_topics

bertopic/_bertopic.py:4177–4215  ·  view source on GitHub ↗

Extract topics from the clusters using a class-based TF-IDF. Arguments: documents: Dataframe with documents and their corresponding IDs; embeddings: The document embeddings; mappings: The mappings from topic to word; verbose: Whether to log the process of extracting topics; … (summary truncated)

(
        self,
        documents: pd.DataFrame,
        embeddings: np.ndarray = None,
        mappings=None,
        verbose: bool = False,
        fine_tune_representation: bool = True,
    )

Source from the content-addressed store, hash-verified

4175 return y, embeddings
4176
def _extract_topics(
    self,
    documents: pd.DataFrame,
    embeddings: np.ndarray = None,
    mappings=None,
    verbose: bool = False,
    fine_tune_representation: bool = True,
):
    """Build per-topic representations from clustered documents via class-based TF-IDF.

    All documents sharing a "Topic" value are concatenated into one string,
    the c-TF-IDF matrix is computed over those strings, words are extracted
    per topic, and finally the topic vectors are (re)created.

    Arguments:
        documents: Dataframe with documents and their corresponding IDs;
                   the "Topic" and "Document" columns are read here
        embeddings: The document embeddings; forwarded unchanged to
                    `_extract_words_per_topic` and `_create_topic_vectors`
        mappings: Forwarded unchanged to `_create_topic_vectors`
        verbose: Whether to log the start and completion of the extraction
        fine_tune_representation: If True, the topic representation will be fine-tuned
                                  using representation models. If False, the topic
                                  representation will remain as the base c-TF-IDF
                                  representation. The same flag is also passed on
                                  as `calculate_aspects`.

    Returns:
        None. The results are stored on the instance: `c_tf_idf_` receives the
        matrix produced by `_c_tf_idf` and `topic_representations_` receives the
        output of `_extract_words_per_topic`.
    """
    if verbose:
        if fine_tune_representation:
            action, method = "Fine-tuning", "representation models"
        else:
            action, method = "Extracting", "c-TF-IDF for topic reduction"
        logger.info(f"Representation - {action} topics using {method}.")

    # Collapse each topic's documents into a single space-joined string.
    grouped_by_topic = documents.groupby(["Topic"], as_index=False)
    joined_per_topic = grouped_by_topic.agg({"Document": " ".join})

    tf_idf_matrix, vocabulary = self._c_tf_idf(joined_per_topic)
    self.c_tf_idf_ = tf_idf_matrix

    # Aspects are only calculated when the representation is being fine-tuned,
    # hence the same flag drives both keyword arguments.
    self.topic_representations_ = self._extract_words_per_topic(
        vocabulary,
        documents,
        fine_tune_representation=fine_tune_representation,
        calculate_aspects=fine_tune_representation,
        embeddings=embeddings,
    )

    self._create_topic_vectors(documents=documents, embeddings=embeddings, mappings=mappings)

    if verbose:
        logger.info("Representation - Completed \u2713")
4216
4217 def _save_representative_docs(self, documents: pd.DataFrame):
4218 """Save the 3 most representative docs per topic.

Callers 8

fit_transformMethod · 0.95
merge_topicsMethod · 0.95
_reduce_topicsMethod · 0.95
_reduce_to_n_topicsMethod · 0.95
_auto_reduce_topicsMethod · 0.95
test_extract_topicsFunction · 0.80

Calls 4

_c_tf_idfMethod · 0.95
_create_topic_vectorsMethod · 0.95
infoMethod · 0.80

Tested by 3

test_extract_topicsFunction · 0.64