Perform tokenization and compute token counts for a single document
(
self, doc, word2idx, idx2word, tokens, doc_count, bol_ix, eol_ix,
)
| 767 | return self |
| 768 | |
def _encode_document(
    self, doc, word2idx, idx2word, tokens, doc_count, bol_ix, eol_ix,
):
    """
    Perform tokenization and compute token counts for a single document.

    The document is split on ``"\\n"`` and each line is tokenized with the
    tokenizer selected by ``self.hyperparameters["tokenizer"]``, then passed
    through ``self._filter_vocab``. Tokens not yet in `word2idx` are
    registered, and the counts of every token seen are incremented.

    Parameters
    ----------
    doc : str
        The document to encode. When
        ``self.hyperparameters["input_type"] == "files"`` this is a path
        which is opened and read using the configured encoding; otherwise
        it is used as the document text directly.
    word2idx : dict
        Mapping from token text to its index in `tokens`. Updated in place.
    idx2word : dict
        Mapping from index in `tokens` to token text. Updated in place.
    tokens : list
        List of ``Token`` objects; new tokens are appended and the
        ``count`` attribute of each token seen is incremented in place.
    doc_count : dict
        Mapping from token index to an occurrence count, incremented once
        for every occurrence encountered during this call. Updated in
        place.
    bol_ix : int
        Index in `tokens` of the entry counted once at the start of every
        line (the ``<bol>`` tag).
    eol_ix : int
        Index in `tokens` of the entry counted once at the end of every
        line (the ``<eol>`` tag).

    Returns
    -------
    word2idx, idx2word, tokens, doc_count
        The same four containers that were passed in, after updating.

    Raises
    ------
    KeyError
        If ``self.hyperparameters["tokenizer"]`` is not one of ``"words"``,
        ``"characters"``, ``"whitespace"``, or ``"bytes"``.
    """
    H = self.hyperparameters
    lowercase = H["lowercase"]
    filter_stop = H["filter_stopwords"]
    filter_punc = H["filter_punctuation"]

    # in "files" mode `doc` is a path rather than the text itself
    if H["input_type"] == "files":
        with open(doc, "r", encoding=H["encoding"]) as handle:
            doc = handle.read()

    tokenizer_dict = {
        "words": tokenize_words,
        "characters": tokenize_chars,
        "whitespace": tokenize_whitespace,
        "bytes": tokenize_bytes_raw,
    }
    tokenizer = tokenizer_dict[H["tokenizer"]]

    for line in doc.split("\n"):
        words = tokenizer(
            line,
            lowercase=lowercase,
            filter_stopwords=filter_stop,
            filter_punctuation=filter_punc,
            encoding=H["encoding"],
        )
        words = self._filter_vocab(words)

        for ww in words:
            if ww not in word2idx:
                # a new token takes the next free slot in `tokens`, keeping
                # word2idx / idx2word / tokens aligned on the same index
                word2idx[ww] = len(tokens)
                idx2word[len(tokens)] = ww
                tokens.append(Token(ww))

            t_idx = word2idx[ww]
            tokens[t_idx].count += 1
            doc_count[t_idx] = doc_count.get(t_idx, 0) + 1

        # wrap line in <bol> and <eol> tags; this runs for every line
        # produced by the split, including empty ones
        tokens[bol_ix].count += 1
        tokens[eol_ix].count += 1

        doc_count[bol_ix] = doc_count.get(bol_ix, 0) + 1
        doc_count[eol_ix] = doc_count.get(eol_ix, 0) + 1
    return word2idx, idx2word, tokens, doc_count
| 820 | |
| 821 | def _keep_top_n_tokens(self): |
| 822 | N = self.hyperparameters["max_tokens"] |