(model, documents, request)
| 21 | ], |
| 22 | ) |
def test_ctfidf(model, documents, request):
    """Exercise the c-TF-IDF pipeline on the topic model fixture named by `model`.

    The documents are joined per topic, preprocessed by the topic model,
    vectorized with the model's own vectorizer, and then passed through a
    freshly fitted ``ClassTfidfTransformer``. The assertions check the
    vocabulary and the types and shapes of both resulting matrices.

    Arguments:
        model: Name of the fixture that provides the topic model.
        documents: The documents that are paired with the model's topics.
        request: The pytest ``request`` object used to resolve `model`.
    """
    # Work on a deep copy so the fixture object itself is never touched.
    topic_model = copy.deepcopy(request.getfixturevalue(model))
    topics = topic_model.topics_

    # Collapse every topic into one long document by joining its members.
    frame = pd.DataFrame({"Document": documents, "ID": range(len(documents)), "Topic": topics})
    per_topic = frame.groupby(["Topic"], as_index=False).agg({"Document": " ".join})
    topic_texts = topic_model._preprocess_text(per_topic.Document.values)

    vectorizer = topic_model.vectorizer_model.fit(topic_texts)

    # Scikit-Learn Deprecation: get_feature_names is deprecated in 1.0
    # and will be removed in 1.2. Please use get_feature_names_out instead.
    has_feature_names_out = version.parse(sklearn_version) >= version.parse("1.0.0")
    words = vectorizer.get_feature_names_out() if has_feature_names_out else vectorizer.get_feature_names()

    X = vectorizer.transform(topic_texts)
    c_tf_idf = ClassTfidfTransformer().fit(X).transform(X)

    # The vocabulary must be reasonably large and made up of strings only.
    assert len(words) > 1000
    assert all(isinstance(word, str) for word in words)

    # Both matrices are sparse with one row per topic and one column per word.
    n_topics = len(set(topics))
    n_words = len(words)
    for matrix in (X, c_tf_idf):
        assert isinstance(matrix, csr_matrix)
        assert matrix.shape[0] == n_topics
        assert matrix.shape[1] == n_words

    assert np.min(X) == 0
| 55 | |
| 56 | |
| 57 | @pytest.mark.parametrize( |
nothing calls this directly
no test coverage detected