MCPcopy Index your code
hub / github.com/ddbourgin/numpy-ml / _sort_tokens

Method _sort_tokens

numpy_ml/preprocessing/nlp.py:904–932  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

902 self.term_freq = doc_counts
903
904 def _sort_tokens(self):
905 # sort tokens alphabetically and recode
906 ix = 0
907 token2idx, idx2token, = (
908 {},
909 {},
910 )
911 special = ["<eol>", "<bol>", "<unk>"]
912 words = sorted(self.token2idx.keys())
913 term_freq = {d: {} for d in self.term_freq.keys()}
914
915 for w in words:
916 if w not in special:
917 old_ix = self.token2idx[w]
918 token2idx[w], idx2token[ix] = ix, w
919 for d in self.term_freq.keys():
920 if old_ix in self.term_freq[d]:
921 count = self.term_freq[d][old_ix]
922 term_freq[d][ix] = count
923 ix += 1
924
925 for w in special:
926 token2idx[w] = len(token2idx)
927 idx2token[len(idx2token)] = w
928
929 self.token2idx = token2idx
930 self.idx2token = idx2token
931 self.term_freq = term_freq
932 self.vocab_counts = Counter({t.word: t.count for t in self._tokens})
933
934 def _calc_idf(self):
935 """

Callers 1

fit · Method · 0.95

Calls

no outgoing calls

Tested by

no test coverage detected