Method _encode_document

numpy_ml/preprocessing/nlp.py:769–819 · view source on GitHub ↗

Perform tokenization and compute token counts for a single document

(
        self, doc, word2idx, idx2word, tokens, doc_count, bol_ix, eol_ix,
    )

Source from the content-addressed store, hash-verified

767	return self
768
def _encode_document(
    self, doc, word2idx, idx2word, tokens, doc_count, bol_ix, eol_ix,
):
    """
    Perform tokenization and compute token counts for a single document.

    The document is split on newlines and each line is tokenized with the
    tokenizer selected by ``self.hyperparameters["tokenizer"]``, then
    passed through ``self._filter_vocab``. Every surviving token is
    registered in the vocabulary containers (if new) and counted.

    Parameters
    ----------
    doc : str
        The document text. When ``self.hyperparameters["input_type"]`` is
        ``"files"``, `doc` is instead a path that is opened and read using
        ``self.hyperparameters["encoding"]``.
    word2idx : dict
        Maps a token's text to its index in `tokens`. Updated in place
        with every previously unseen token.
    idx2word : dict
        Maps an index in `tokens` back to the token's text. Updated in
        place alongside `word2idx`.
    tokens : list
        ``Token`` objects, positioned by token index. A new ``Token`` is
        appended for each unseen token, and the ``count`` attribute of the
        matching entry is incremented for every occurrence.
    doc_count : dict
        Maps a token index to the number of occurrences tallied for it.
        Updated in place; indices not yet present start from 0.
    bol_ix : int
        Index in `tokens` of the ``<bol>`` marker token.
    eol_ix : int
        Index in `tokens` of the ``<eol>`` marker token.

    Returns
    -------
    word2idx : dict
    idx2word : dict
    tokens : list
    doc_count : dict
        The same four containers that were passed in, after updating.

    Raises
    ------
    KeyError
        If ``self.hyperparameters["tokenizer"]`` is not one of
        ``"words"``, ``"characters"``, ``"whitespace"`` or ``"bytes"``.
    """
    H = self.hyperparameters
    lowercase = H["lowercase"]
    filter_stop = H["filter_stopwords"]
    filter_punc = H["filter_punctuation"]
    # read once: used both for file decoding and by the tokenizer on each line
    encoding = H["encoding"]

    if H["input_type"] == "files":
        with open(doc, "r", encoding=encoding) as handle:
            doc = handle.read()

    tokenizer_dict = {
        "words": tokenize_words,
        "characters": tokenize_chars,
        "whitespace": tokenize_whitespace,
        "bytes": tokenize_bytes_raw,
    }
    tokenizer = tokenizer_dict[H["tokenizer"]]

    for line in doc.split("\n"):
        words = tokenizer(
            line,
            lowercase=lowercase,
            filter_stopwords=filter_stop,
            filter_punctuation=filter_punc,
            encoding=encoding,
        )
        words = self._filter_vocab(words)

        for ww in words:
            if ww not in word2idx:
                # a new token's index is its position at the end of `tokens`
                new_ix = len(tokens)
                word2idx[ww] = new_ix
                idx2word[new_ix] = ww
                tokens.append(Token(ww))

            t_idx = word2idx[ww]
            tokens[t_idx].count += 1
            doc_count[t_idx] = doc_count.get(t_idx, 0) + 1

        # wrap line in <bol> and <eol> tags: each line of the document
        # (including empty ones) contributes one <bol> and one <eol>
        tokens[bol_ix].count += 1
        tokens[eol_ix].count += 1

        doc_count[bol_ix] = doc_count.get(bol_ix, 0) + 1
        doc_count[eol_ix] = doc_count.get(eol_ix, 0) + 1
    return word2idx, idx2word, tokens, doc_count
820
821	def _keep_top_n_tokens(self):
822	N = self.hyperparameters["max_tokens"]

fitMethod · 0.95

TokenClass · 0.85

no test coverage detected