(N=15)
| 77 | |
| 78 | |
def test_tfidf(N=15):
    """Compare `TFIDFEncoder` output with `TfidfVectorizer` on random corpora.

    Runs `N` independent trials under a fixed NumPy seed. Each trial builds a
    random corpus with `random_paragraph`, fits both encoders on it with
    matching settings (lowercasing on, identical `smooth_idf`, no
    normalization on the reference), and asserts that the two resulting
    matrices are almost equal.

    Parameters
    ----------
    N : int
        Number of random trials to run. Default is 15.

    Raises
    ------
    AssertionError
        Raised by `np.testing.assert_almost_equal` when the two encodings
        disagree.
    """
    # Fixed seed so every run draws the same sequence of random corpora.
    np.random.seed(12345)

    for _trial in range(N):
        # 1-9 documents per corpus (the upper bound of `randint` is exclusive).
        doc_count = np.random.randint(1, 10)

        corpus = []
        for _doc in range(doc_count):
            line_count = np.random.randint(1, 1000)
            paragraphs = [
                random_paragraph(np.random.randint(1, 10))
                for _ in range(line_count)
            ]
            # Space-join the items of each paragraph, one paragraph per line.
            text = "\n".join(" ".join(words) for words in paragraphs)
            corpus.append(text)

        # Randomly toggle IDF smoothing; both encoders receive the same flag.
        use_smoothing = bool(np.random.randint(2))

        encoder = TFIDFEncoder(
            lowercase=True,
            min_count=0,
            smooth_idf=use_smoothing,
            max_tokens=None,
            input_type="strings",
            filter_stopwords=False,
        )
        # Reference implementation (presumably scikit-learn's -- confirm
        # against the file's imports).
        reference = TfidfVectorizer(
            input="content",
            norm=None,
            use_idf=True,
            lowercase=True,
            smooth_idf=use_smoothing,
            sublinear_tf=False,
        )

        encoder.fit(corpus)
        mine = encoder.transform(ignore_special_chars=True)
        theirs = reference.fit_transform(corpus).toarray()

        np.testing.assert_almost_equal(mine, theirs)
        print("PASSED")
| 117 | |
| 118 | |
| 119 | def test_dct(N=15): |
nothing calls this directly
no test coverage detected