Generate the term-frequency inverse-document-frequency encoding of a text corpus. Parameters ---------- ignore_special_chars : bool Whether to drop columns corresponding to "<eol>", "<bol>", and "<unk>" tokens from the final tfidf encoding. Default is True.
(self, ignore_special_chars=True)
| 962 | self.inv_doc_freq = inv_doc_freq |
| 963 | |
def transform(self, ignore_special_chars=True):
    """
    Generate the term-frequency inverse-document-frequency encoding of a
    text corpus.

    Parameters
    ----------
    ignore_special_chars : bool
        Whether to drop columns corresponding to "<eol>", "<bol>", and
        "<unk>" tokens from the final tfidf encoding. Default is True.

    Returns
    -------
    tfidf : numpy array of shape `(D, M [- 3])`
        The encoded corpus, with each row corresponding to a single
        document, and each column corresponding to a token id. `D` is
        ``len(self._idx2doc)`` and `M` is ``len(self._tokens)``. The
        mapping between column numbers and tokens is stored in the
        `idx2token` attribute IFF `ignore_special_chars` is False.
        Otherwise, the mappings are not accurate, because the three
        special-token columns have been removed.

    Notes
    -----
    A document whose entry in ``self.term_freq`` is empty produces an
    all-zero row rather than raising an error.
    """
    D, N = len(self._idx2doc), len(self._tokens)
    tf = np.zeros((D, N))

    for d_ix in self._idx2doc:
        doc_counts = self.term_freq[d_ix]
        if not doc_counts:
            # Nothing to scatter for an empty document; its row stays zero.
            continue
        token_ids = list(doc_counts.keys())
        # Scatter this document's counts into its row; keys() and values()
        # of the same dict iterate in matching order.
        tf[d_ix, token_ids] = list(doc_counts.values())

    # One idf weight per token id, ordered by id so that it lines up with
    # the columns of `tf`. Broadcasting applies it to every row without
    # materialising a second (D, N) array.
    idf = np.array([self.inv_doc_freq[w] for w in sorted(self.idx2token)])
    tfidf = tf * idf

    if ignore_special_chars:
        special_cols = [
            self.token2idx["<unk>"],
            self.token2idx["<eol>"],
            self.token2idx["<bol>"],
        ]
        tfidf = np.delete(tfidf, special_cols, 1)

    return tfidf
| 1006 | |
| 1007 | |
| 1008 | class Vocabulary: |