Method _drop_low_freq_tokens

numpy_ml/preprocessing/nlp.py:865–902 · view source on GitHub ↗

Replace all tokens that occur fewer than `min_count` times with the `<unk>` token.

(self)

Source from the content-addressed store, hash-verified

863	assert len(self._tokens) <= N
864
def _drop_low_freq_tokens(self):
    """
    Fold every rare token into the `<unk>` token and rebuild the vocabulary.

    A token is considered rare when its ``count`` is strictly less than
    ``self.hyperparameters["min_count"]``. The count of each rare token is
    added (in place) to the count of the existing `<unk>` token object.

    After the call:

    - ``self._tokens`` holds `<unk>`, `<eol>`, `<bol>` (in that order)
      followed by the surviving tokens in their previous relative order.
    - ``self.token2idx`` / ``self.idx2token`` describe the new positions.
    - ``self.term_freq`` is re-keyed to the new indices; per-document
      counts of dropped tokens are accumulated under the `<unk>` index.
    """
    hparams = self.hyperparameters
    reserved = ("<unk>", "<eol>", "<bol>")
    reserved_words = frozenset(reserved)

    # The reserved tokens always occupy slots 0, 1 and 2 of the new vocab.
    kept = [self._tokens[self.token2idx[name]] for name in reserved]
    new_token2idx = {name: slot for slot, name in enumerate(reserved)}
    new_idx2token = dict(enumerate(reserved))
    unk_slot = 0

    for tok in self._tokens:
        if tok.word in reserved_words:
            # already placed at the front of `kept`
            continue

        if tok.count < hparams["min_count"]:
            # rare token: its occurrences are credited to `<unk>`
            kept[unk_slot].count += tok.count
            continue

        slot = len(kept)
        new_token2idx[tok.word] = slot
        new_idx2token[slot] = tok.word
        kept.append(tok)

    # Translate the per-document term frequencies to the new index space.
    # Words missing from the new vocabulary were dropped above, so their
    # counts are merged into the `<unk>` slot.
    remapped = {}
    for doc_id, freqs in self.term_freq.items():
        per_doc = {}
        for prev_idx, n_occurrences in freqs.items():
            target = new_token2idx.get(self.idx2token[prev_idx], unk_slot)
            per_doc[target] = per_doc.get(target, 0) + n_occurrences
        remapped[doc_id] = per_doc

    self._tokens = kept
    self.token2idx = new_token2idx
    self.idx2token = new_idx2token
    self.term_freq = remapped
903
904	def _sort_tokens(self):
905	# sort tokens alphabetically and recode

fitMethod · 0.95

no outgoing calls

no test coverage detected