hub / github.com/MaartenGr/BERTopic / _c_tf_idf

Method _c_tf_idf

bertopic/_bertopic.py:4398–4453 · view source on GitHub ↗

Calculate a class-based TF-IDF where m is the number of total documents. Arguments: documents_per_topic: The joined documents per topic, such that each topic has a single string made out of multiple documents; m: The total number of documents (unjoined).

(
        self,
        documents_per_topic: pd.DataFrame,
        fit: bool = True,
        partial_fit: bool = False,
    )

Source from the content-addressed store, hash-verified

4396	self.topic_embeddings_ = np.array(topic_embeddings)
4397
def _c_tf_idf(
    self,
    documents_per_topic: pd.DataFrame,
    fit: bool = True,
    partial_fit: bool = False,
) -> Tuple[csr_matrix, List[str]]:
    """Calculate a class-based TF-IDF representation of the joined topic documents.

    Arguments:
        documents_per_topic: The joined documents per topic such that each topic has a single
                             string made out of multiple documents. Only its `Document`
                             column is read here.
        fit: Whether to fit a new vectorizer or use the fitted self.vectorizer_model.
             When True, `self.ctfidf_model` is also (re)fitted and reassigned.
        partial_fit: Whether to run `partial_fit` for online learning. For the vectorizer
                     step this takes precedence over `fit`; the c-TF-IDF model is still
                     refitted whenever `fit` is True.

    Returns:
        tf_idf: The resulting matrix giving a value (importance score) for each word per topic
        words: The names of the words to which values were given
    """
    documents = self._preprocess_text(documents_per_topic.Document.values)

    # Build the bag-of-words matrix; `partial_fit` wins over `fit` when both are set.
    if partial_fit:
        X = self.vectorizer_model.partial_fit(documents).update_bow(documents)
    elif fit:
        X = self.vectorizer_model.fit_transform(documents)
    else:
        X = self.vectorizer_model.transform(documents)

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    if version.parse(sklearn_version) >= version.parse("1.0.0"):
        words = self.vectorizer_model.get_feature_names_out()
    else:
        words = self.vectorizer_model.get_feature_names()

    # Per-word weights passed to the c-TF-IDF model. Stays None when neither
    # seed words nor a seed topic list is configured.
    seed_words = self.ctfidf_model.seed_words
    seed_topic_list = self.seed_topic_list
    multiplier = None
    if seed_words or seed_topic_list:
        # Fixed boost applied to every word that occurs in the seed topic list.
        seed_topic_multiplier = 1.2

        # Sets are built once so each vocabulary word costs a single O(1) lookup
        # instead of a linear scan over the seed lists.
        seed_word_lookup = set(seed_words) if seed_words else set()
        seed_topic_lookup = (
            {seed for seeds in seed_topic_list for seed in seeds} if seed_topic_list else set()
        )
        # Only read `seed_multiplier` when seed words are actually configured.
        seed_word_multiplier = self.ctfidf_model.seed_multiplier if seed_words else 1

        weights = []
        for word in words:
            # A word found in the seed topic list always gets the topic boost,
            # even if it is also one of the c-TF-IDF model's seed words.
            if word in seed_topic_lookup:
                weights.append(seed_topic_multiplier)
            elif word in seed_word_lookup:
                weights.append(seed_word_multiplier)
            else:
                weights.append(1)
        multiplier = np.array(weights)

    if fit:
        self.ctfidf_model = self.ctfidf_model.fit(X, multiplier=multiplier)

    c_tf_idf = self.ctfidf_model.transform(X)

    return c_tf_idf, words
4454
4455	def _update_topic_size(self, documents: pd.DataFrame):

Callers 5

partial_fit — Method · 0.95

topics_over_time — Method · 0.95

topics_per_class — Method · 0.95

update_topics — Method · 0.95

_extract_topics — Method · 0.95

Calls 6

_preprocess_text — Method · 0.95

update_bow — Method · 0.80

fit_transform — Method · 0.80

partial_fit — Method · 0.45

transform — Method · 0.45

fit — Method · 0.45

Tested by

no test coverage detected