MCPcopy Index your code
hub / github.com/ddbourgin/numpy-ml / _encode_document

Method _encode_document

numpy_ml/preprocessing/nlp.py:769–819  ·  view source on GitHub ↗

Perform tokenization and compute token counts for a single document

(
        self, doc, word2idx, idx2word, tokens, doc_count, bol_ix, eol_ix,
    )

Source from the content-addressed store, hash-verified

767 return self
768
def _encode_document(
    self, doc, word2idx, idx2word, tokens, doc_count, bol_ix, eol_ix,
):
    """
    Perform tokenization and compute token counts for a single document.

    Parameters
    ----------
    doc : str
        The document text. When the ``"input_type"`` hyperparameter is
        ``"files"``, this is instead a path that is opened (using the
        ``"encoding"`` hyperparameter) and read in full.
    word2idx : dict
        Mapping from token text to its index in `tokens`. Updated in place
        whenever a previously unseen token is encountered.
    idx2word : dict
        Mapping from index in `tokens` to token text. Updated in place
        alongside `word2idx`.
    tokens : list
        List of ``Token`` objects. Unseen tokens are appended, and the
        ``count`` attribute of every encountered token is incremented.
    doc_count : dict
        Mapping from token index to an occurrence tally. Each occurrence
        seen during this call adds 1 on top of whatever value is already
        stored. Updated in place.
    bol_ix : int
        Index in `tokens` of the ``<bol>`` (beginning-of-line) entry.
    eol_ix : int
        Index in `tokens` of the ``<eol>`` (end-of-line) entry.

    Returns
    -------
    word2idx, idx2word, tokens, doc_count
        The same objects that were passed in, after in-place mutation.
    """
    H = self.hyperparameters
    lowercase = H["lowercase"]
    filter_stop = H["filter_stopwords"]
    filter_punc = H["filter_punctuation"]

    # In "files" mode `doc` is a path; replace it with the file's contents.
    if H["input_type"] == "files":
        with open(doc, "r", encoding=H["encoding"]) as handle:
            doc = handle.read()

    tokenizer_dict = {
        "words": tokenize_words,
        "characters": tokenize_chars,
        "whitespace": tokenize_whitespace,
        "bytes": tokenize_bytes_raw,
    }
    tokenizer = tokenizer_dict[H["tokenizer"]]

    for line in doc.split("\n"):
        words = tokenizer(
            line,
            lowercase=lowercase,
            filter_stopwords=filter_stop,
            filter_punctuation=filter_punc,
            encoding=H["encoding"],
        )
        words = self._filter_vocab(words)

        for ww in words:
            # Register tokens the first time they are seen; the new index
            # is the position the Token is about to occupy in `tokens`.
            if ww not in word2idx:
                word2idx[ww] = len(tokens)
                idx2word[len(tokens)] = ww
                tokens.append(Token(ww))

            t_idx = word2idx[ww]
            tokens[t_idx].count += 1
            doc_count[t_idx] = doc_count.get(t_idx, 0) + 1

        # wrap line in <bol> and <eol> tags
        tokens[bol_ix].count += 1
        tokens[eol_ix].count += 1

        doc_count[bol_ix] = doc_count.get(bol_ix, 0) + 1
        doc_count[eol_ix] = doc_count.get(eol_ix, 0) + 1
    return word2idx, idx2word, tokens, doc_count
820
821 def _keep_top_n_tokens(self):
822 N = self.hyperparameters["max_tokens"]

Callers 1

fit · Method · 0.95

Calls 1

Token · Class · 0.85

Tested by

no test coverage detected