MCPcopy
hub / github.com/MaartenGr/BERTopic / _c_tf_idf

Method _c_tf_idf

bertopic/_bertopic.py:4398–4453  ·  view source on GitHub ↗

Calculate a class-based TF-IDF where m is the number of total documents. Arguments: documents_per_topic: The joined documents per topic such that each topic has a single string made out of multiple documents m: The total number of doc

(
        self,
        documents_per_topic: pd.DataFrame,
        fit: bool = True,
        partial_fit: bool = False,
    )

Source from the content-addressed store, hash-verified

4396 self.topic_embeddings_ = np.array(topic_embeddings)
4397
def _c_tf_idf(
    self,
    documents_per_topic: pd.DataFrame,
    fit: bool = True,
    partial_fit: bool = False,
) -> Tuple[csr_matrix, List[str]]:
    """Calculate a class-based TF-IDF representation for every topic.

    The joined topic documents are preprocessed, turned into a bag-of-words
    matrix by `self.vectorizer_model` and then weighted by `self.ctfidf_model`.
    When the c-TF-IDF model is (re)fitted, words listed in
    `self.ctfidf_model.seed_words` and/or `self.seed_topic_list` receive a
    multiplier that is passed on to `self.ctfidf_model.fit`.

    Arguments:
        documents_per_topic: The joined documents per topic such that each topic has a single
                             string made out of multiple documents
        fit: Whether to fit the vectorizer and the c-TF-IDF model on these documents
             or to reuse the already fitted `self.vectorizer_model` and
             `self.ctfidf_model`
        partial_fit: Whether to run `partial_fit` on the vectorizer for online learning.
                     Takes precedence over `fit` for the vectorizer step only; the
                     c-TF-IDF model is still refitted whenever `fit` is True

    Returns:
        tf_idf: The resulting matrix giving a value (importance score) for each word per topic
        words: The names of the words to which values were given
    """
    documents = self._preprocess_text(documents_per_topic.Document.values)

    if partial_fit:
        X = self.vectorizer_model.partial_fit(documents).update_bow(documents)
    elif fit:
        X = self.vectorizer_model.fit_transform(documents)
    else:
        X = self.vectorizer_model.transform(documents)

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    if version.parse(sklearn_version) >= version.parse("1.0.0"):
        words = self.vectorizer_model.get_feature_names_out()
    else:
        words = self.vectorizer_model.get_feature_names()

    if fit:
        # The multiplier is only consumed by `fit`, so it is only built here.
        multiplier = None
        seed_words = self.ctfidf_model.seed_words
        seed_topic_list = self.seed_topic_list

        if seed_words or seed_topic_list:
            # Build the lookup sets once so that each vocabulary word is an
            # O(1) membership test instead of a scan over the seed lists.
            boosted_words = set(seed_words) if seed_words else set()
            topic_seeds = (
                {seed for seeds in seed_topic_list for seed in seeds} if seed_topic_list else set()
            )
            # `seed_multiplier` is only read when seed words are configured.
            seed_multiplier = self.ctfidf_model.seed_multiplier if seed_words else 1
            # Fixed boost for words occurring in `seed_topic_list`; it overrides
            # `seed_multiplier` for words that appear in both collections.
            topic_seed_boost = 1.2

            multiplier = np.array(
                [
                    topic_seed_boost
                    if word in topic_seeds
                    else (seed_multiplier if word in boosted_words else 1)
                    for word in words
                ]
            )

        self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier)

    c_tf_idf = self.ctfidf_model.transform(X)

    return c_tf_idf, words
4454
4455 def _update_topic_size(self, documents: pd.DataFrame):

Callers 5

partial_fitMethod · 0.95
topics_over_timeMethod · 0.95
topics_per_classMethod · 0.95
update_topicsMethod · 0.95
_extract_topicsMethod · 0.95

Calls 6

_preprocess_textMethod · 0.95
update_bowMethod · 0.80
fit_transformMethod · 0.80
partial_fitMethod · 0.45
transformMethod · 0.45
fitMethod · 0.45

Tested by

no test coverage detected