MCPcopy Index your code
hub / github.com/ddbourgin/numpy-ml / _drop_low_freq_tokens

Method _drop_low_freq_tokens

numpy_ml/preprocessing/nlp.py:865–902  ·  view source on GitHub ↗

Replace all tokens that occur fewer than `min_count` times with the `<unk>` token.

(self)

Source from the content-addressed store, hash-verified

863 assert len(self._tokens) <= N
864
865 def _drop_low_freq_tokens(self):
866 """
867 Replace all tokens that occur less than `min_count` with the `<unk>`
868 token.
869 """
870 H = self.hyperparameters
871 unk_token = self._tokens[self.token2idx["<unk>"]]
872 eol_token = self._tokens[self.token2idx["<eol>"]]
873 bol_token = self._tokens[self.token2idx["<bol>"]]
874 tokens = [unk_token, eol_token, bol_token]
875
876 unk_idx = 0
877 word2idx = {"<unk>": 0, "<eol>": 1, "<bol>": 2}
878 idx2word = {0: "<unk>", 1: "<eol>", 2: "<bol>"}
879 special = {"<eol>", "<bol>", "<unk>"}
880
881 for tt in self._tokens:
882 if tt.word not in special:
883 if tt.count < H["min_count"]:
884 tokens[unk_idx].count += tt.count
885 else:
886 word2idx[tt.word] = len(tokens)
887 idx2word[len(tokens)] = tt.word
888 tokens.append(tt)
889
890 # reindex document counts
891 doc_counts = {}
892 for d_idx in self.term_freq.keys():
893 doc_counts[d_idx] = {}
894 for old_idx, d_count in self.term_freq[d_idx].items():
895 word = self.idx2token[old_idx]
896 new_idx = word2idx.get(word, unk_idx)
897 doc_counts[d_idx][new_idx] = doc_counts[d_idx].get(new_idx, 0) + d_count
898
899 self._tokens = tokens
900 self.token2idx = word2idx
901 self.idx2token = idx2word
902 self.term_freq = doc_counts
903
904 def _sort_tokens(self):
905 # sort tokens alphabetically and recode

Callers 1

fitMethod · 0.95

Calls

no outgoing calls

Tested by

no test coverage detected