MCPcopy Index your code
hub / github.com/ddbourgin/numpy-ml / transform

Method transform

numpy_ml/preprocessing/nlp.py:964–1005  ·  view source on GitHub ↗

Generate the term-frequency inverse-document-frequency encoding of a text corpus. Parameters ---------- ignore_special_chars : bool Whether to drop columns corresponding to "<eol>", "<bol>", and "<unk>" tokens from the final tfidf encoding. Default is True.

(self, ignore_special_chars=True)

Source from the content-addressed store, hash-verified

962 self.inv_doc_freq = inv_doc_freq
963
def transform(self, ignore_special_chars=True):
    """
    Generate the term-frequency inverse-document-frequency encoding of a
    text corpus.

    Parameters
    ----------
    ignore_special_chars : bool
        Whether to drop columns corresponding to "<eol>", "<bol>", and
        "<unk>" tokens from the final tfidf encoding. Default is True.

    Returns
    -------
    tfidf : numpy array of shape `(D, M [- 3])`
        The encoded corpus, with each row corresponding to a single
        document, and each column corresponding to a token id. The mapping
        between column numbers and tokens is stored in the `idx2token`
        attribute IFF `ignore_special_chars` is False. Otherwise, the
        mappings are not accurate. A document with no counted terms is
        encoded as an all-zero row.
    """
    D, N = len(self._idx2doc), len(self._tokens)
    tf = np.zeros((D, N))

    for d_ix in self._idx2doc:
        doc_counts = self.term_freq[d_ix]
        if not doc_counts:
            # Nothing to scatter; the row stays zero. (Unpacking
            # `zip(*{}.items())` would raise ValueError here.)
            continue
        word_ids = list(doc_counts)
        tf[d_ix, word_ids] = list(doc_counts.values())

    # One idf weight per token id, in column order. Broadcasting the (N,)
    # vector across the rows of `tf` avoids materializing a (D, N) copy.
    idf = np.array([self.inv_doc_freq[w] for w in sorted(self.idx2token)])
    tfidf = tf * idf

    if ignore_special_chars:
        special_ixs = [
            self.token2idx["<unk>"],
            self.token2idx["<eol>"],
            self.token2idx["<bol>"],
        ]
        tfidf = np.delete(tfidf, special_ixs, axis=1)

    return tfidf
1006
1007
1008class Vocabulary:

Callers 1

test_tfidfFunction · 0.95

Calls

no outgoing calls

Tested by 1

test_tfidfFunction · 0.76