Replace all tokens that occur fewer than `min_count` times with the `<unk>` token.
(self)
| 863 | assert len(self._tokens) <= N |
| 864 | |
| 865 | def _drop_low_freq_tokens(self): |
| 866 | """ |
| 867 | Replace all tokens that occur less than `min_count` with the `<unk>` |
| 868 | token. |
| 869 | """ |
| 870 | H = self.hyperparameters |
| 871 | unk_token = self._tokens[self.token2idx["<unk>"]] |
| 872 | eol_token = self._tokens[self.token2idx["<eol>"]] |
| 873 | bol_token = self._tokens[self.token2idx["<bol>"]] |
| 874 | tokens = [unk_token, eol_token, bol_token] |
| 875 | |
| 876 | unk_idx = 0 |
| 877 | word2idx = {"<unk>": 0, "<eol>": 1, "<bol>": 2} |
| 878 | idx2word = {0: "<unk>", 1: "<eol>", 2: "<bol>"} |
| 879 | special = {"<eol>", "<bol>", "<unk>"} |
| 880 | |
| 881 | for tt in self._tokens: |
| 882 | if tt.word not in special: |
| 883 | if tt.count < H["min_count"]: |
| 884 | tokens[unk_idx].count += tt.count |
| 885 | else: |
| 886 | word2idx[tt.word] = len(tokens) |
| 887 | idx2word[len(tokens)] = tt.word |
| 888 | tokens.append(tt) |
| 889 | |
| 890 | # reindex document counts |
| 891 | doc_counts = {} |
| 892 | for d_idx in self.term_freq.keys(): |
| 893 | doc_counts[d_idx] = {} |
| 894 | for old_idx, d_count in self.term_freq[d_idx].items(): |
| 895 | word = self.idx2token[old_idx] |
| 896 | new_idx = word2idx.get(word, unk_idx) |
| 897 | doc_counts[d_idx][new_idx] = doc_counts[d_idx].get(new_idx, 0) + d_count |
| 898 | |
| 899 | self._tokens = tokens |
| 900 | self.token2idx = word2idx |
| 901 | self.idx2token = idx2word |
| 902 | self.term_freq = doc_counts |
| 903 | |
| 904 | def _sort_tokens(self): |
| 905 | # sort tokens alphabetically and recode |