| 902 | self.term_freq = doc_counts |
| 903 | |
def _sort_tokens(self):
    """
    Re-index the vocabulary so that token IDs follow alphabetical order.

    Every token in ``self.token2idx`` other than the special markers
    ``<eol>``, ``<bol>`` and ``<unk>`` receives a new ID equal to its rank
    in the sorted token list. The three special markers are then appended,
    in that order, with the next free IDs; they are added even if they
    were absent from the previous vocabulary.

    The per-document counts in ``self.term_freq`` are re-keyed from the old
    IDs to the new ones, and ``self.vocab_counts`` is rebuilt from the
    ``word`` / ``count`` attributes of the items in ``self._tokens``.

    Updates ``self.token2idx``, ``self.idx2token``, ``self.term_freq`` and
    ``self.vocab_counts`` in place; returns ``None``.

    Notes
    -----
    Counts stored in ``self.term_freq`` under the old IDs of the special
    markers are not carried over to the re-keyed mapping.
    NOTE(review): this mirrors the previous behavior; confirm whether
    dropping the ``<unk>`` counts here is intended.
    """
    special = ("<eol>", "<bol>", "<unk>")

    # Regular (non-special) tokens in alphabetical order; a token's position
    # in this list is its new ID.
    regular = sorted(w for w in self.token2idx if w not in special)

    token2idx = {w: ix for ix, w in enumerate(regular)}
    idx2token = dict(enumerate(regular))

    # Special markers always occupy the last IDs, in a fixed order.
    for w in special:
        token2idx[w] = len(token2idx)
        idx2token[len(idx2token)] = w

    # Old ID -> new ID for regular tokens only. Building this once lets each
    # document be re-keyed in a single pass instead of rescanning every
    # document for every vocabulary word. Assumes each token has a distinct
    # old ID.
    old_to_new = {self.token2idx[w]: ix for ix, w in enumerate(regular)}

    # Re-key each document's counts. Entries whose old ID is not a regular
    # token (i.e. the special markers) are skipped. Sorting keeps the keys in
    # ascending new-ID order, matching the insertion order produced before.
    term_freq = {
        d: dict(
            sorted(
                (old_to_new[old_ix], count)
                for old_ix, count in counts.items()
                if old_ix in old_to_new
            )
        )
        for d, counts in self.term_freq.items()
    }

    self.token2idx = token2idx
    self.idx2token = idx2token
    self.term_freq = term_freq
    self.vocab_counts = Counter({t.word: t.count for t in self._tokens})
| 933 | |
| 934 | def _calc_idf(self): |
| 935 | """ |