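Calculate a class-based TF-IDF where m is the number of total documents. Arguments: documents_per_topic: The joined documents per topic such that each topic has a single string made out of multiple documents. m: The total number of documents (unjoined). fit: Whether to fit a new vectorizer or use the fitted self.vectorizer_model. partial_fit: Whether to run `partial_fit` for online learning.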
(
self,
documents_per_topic: pd.DataFrame,
fit: bool = True,
partial_fit: bool = False,
)
| 4396 | self.topic_embeddings_ = np.array(topic_embeddings) |
| 4397 | |
def _c_tf_idf(
    self,
    documents_per_topic: pd.DataFrame,
    fit: bool = True,
    partial_fit: bool = False,
) -> Tuple[csr_matrix, List[str]]:
    """Calculate a class-based TF-IDF representation of the joined topic documents.

    Arguments:
        documents_per_topic: The joined documents per topic such that each topic has a single
                             string made out of multiple documents
        fit: Whether to fit a new vectorizer or use the fitted self.vectorizer_model
        partial_fit: Whether to run `partial_fit` for online learning

    Returns:
        tf_idf: The resulting matrix giving a value (importance score) for each word per topic
        words: The names of the words to which values were given
    """
    documents = self._preprocess_text(documents_per_topic.Document.values)

    # `partial_fit` takes precedence over `fit` when both flags are set.
    if partial_fit:
        X = self.vectorizer_model.partial_fit(documents).update_bow(documents)
    elif fit:
        X = self.vectorizer_model.fit_transform(documents)
    else:
        X = self.vectorizer_model.transform(documents)

    # Scikit-Learn deprecated `get_feature_names` in 1.0 and removed it in 1.2 in favour of
    # `get_feature_names_out`. Detect the accessor on the vectorizer itself rather than on the
    # installed scikit-learn version, so vectorizers exposing only one of the two keep working.
    if hasattr(self.vectorizer_model, "get_feature_names_out"):
        words = self.vectorizer_model.get_feature_names_out()
    else:
        words = self.vectorizer_model.get_feature_names()

    # Per-word multiplier handed to the c-TF-IDF model when fitting; stays None without seeds.
    multiplier = None
    seed_words = self.ctfidf_model.seed_words
    if seed_words or self.seed_topic_list:
        # Sets give constant-time membership tests while looping over the whole vocabulary.
        boosted_words = set(seed_words) if seed_words else set()
        topic_seeds = (
            {seed for seeds in self.seed_topic_list for seed in seeds} if self.seed_topic_list else set()
        )
        # Only read `seed_multiplier` when seed words are actually configured.
        seed_multiplier = self.ctfidf_model.seed_multiplier if seed_words else 1
        topic_seed_boost = 1.2

        # A word from `seed_topic_list` always receives the fixed topic boost, even when it is
        # also listed in the c-TF-IDF model's `seed_words`.
        multiplier = np.array(
            [
                topic_seed_boost if word in topic_seeds else seed_multiplier if word in boosted_words else 1
                for word in words
            ]
        )

    if fit:
        self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier)

    c_tf_idf = self.ctfidf_model.transform(X)

    return c_tf_idf, words
| 4454 | |
| 4455 | def _update_topic_size(self, documents: pd.DataFrame): |
no test coverage detected