MCPcopy
hub / github.com/MaartenGr/BERTopic / test_ctfidf

Function test_ctfidf

tests/test_vectorizers/test_ctfidf.py:23–54  ·  view source on GitHub ↗
(model, documents, request)

Source from the content-addressed store, hash-verified

21 ],
22)
def test_ctfidf(model, documents, request):
    """Check the vectorizer and c-TF-IDF outputs for a topic model fixture.

    The documents are joined into one string per topic, vectorized with the
    model's own ``vectorizer_model``, and transformed with a freshly fitted
    ``ClassTfidfTransformer``. The test then checks the vocabulary, the
    matrix types, and that both matrices have one row per distinct topic
    and one column per vocabulary word.

    Args:
        model: Name of the fixture to load through ``request.getfixturevalue``.
        documents: Sized collection of documents, aligned with the model's
            ``topics_`` (assumed to be raw text strings -- confirm against
            the fixture).
        request: Object providing ``getfixturevalue`` (presumably pytest's
            built-in ``request`` fixture).
    """
    # Work on a deep copy so the fixture object itself is never touched.
    fitted_model = copy.deepcopy(request.getfixturevalue(model))
    topics = fitted_model.topics_

    # One row per document, labelled with its position and assigned topic.
    doc_frame = pd.DataFrame(
        {
            "Document": documents,
            "ID": range(len(documents)),
            "Topic": topics,
        }
    )

    # Collapse every topic into a single space-joined string.
    grouped = doc_frame.groupby(["Topic"], as_index=False)
    per_topic = grouped.agg({"Document": " ".join})
    topic_texts = fitted_model._preprocess_text(per_topic.Document.values)
    fitted_vectorizer = fitted_model.vectorizer_model.fit(topic_texts)

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    has_feature_names_out = version.parse(sklearn_version) >= version.parse("1.0.0")
    if has_feature_names_out:
        words = fitted_vectorizer.get_feature_names_out()
    else:
        words = fitted_vectorizer.get_feature_names()

    X = fitted_vectorizer.transform(topic_texts)
    ctfidf_model = ClassTfidfTransformer().fit(X)
    c_tf_idf = ctfidf_model.transform(X)

    # Vocabulary: more than 1000 entries, every one of them a string.
    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    # Both the raw counts and the weighted matrix are CSR sparse matrices.
    for matrix in (X, c_tf_idf):
        assert isinstance(matrix, csr_matrix)

    # Each matrix has one row per distinct topic and one column per word.
    for matrix in (X, c_tf_idf):
        assert matrix.shape[0] == len(set(topics))
        assert matrix.shape[1] == len(words)

    # The smallest entry of the count matrix must be exactly zero.
    assert np.min(X) == 0
55
56
57@pytest.mark.parametrize(

Callers

nothing calls this directly

Calls 4

_preprocess_textMethod · 0.80
fitMethod · 0.45
transformMethod · 0.45

Tested by

no test coverage detected