Method _sort_tokens

numpy_ml/preprocessing/nlp.py:904–932 · view source on GitHub ↗

(self)

Source from the content-addressed store, hash-verified

902	self.term_freq = doc_counts
903
def _sort_tokens(self):
    """
    Re-index the vocabulary so that token IDs follow alphabetical order.

    Every non-special token in `self.token2idx` receives a new ID equal to
    its rank in the sorted word list, starting from 0. The special tokens
    ``<eol>``, ``<bol>`` and ``<unk>`` are then appended, in that order,
    with the next free IDs -- whether or not they were part of the
    previous vocabulary.

    The per-document count tables in `self.term_freq` are rewritten to use
    the new IDs. Only counts keyed by the old ID of a non-special token are
    carried over; in particular, counts stored under the old IDs of the
    special tokens are dropped.

    Overwrites `self.token2idx`, `self.idx2token`, `self.term_freq` and
    `self.vocab_counts`. Returns None.
    """
    # NOTE(review): dropping the special tokens' per-document counts is
    # inherited behaviour and is kept as-is here -- confirm it is intended.
    special = ("<eol>", "<bol>", "<unk>")
    old_token2idx = self.token2idx

    # Alphabetical rank of each regular token becomes its new ID.
    regular = [w for w in sorted(old_token2idx) if w not in special]
    token2idx = {w: ix for ix, w in enumerate(regular)}
    idx2token = dict(enumerate(regular))

    # old ID -> new ID. Translating each document through this table costs
    # one lookup per stored count, instead of probing every document once
    # per vocabulary word. Assumes the IDs in `self.token2idx` are unique.
    recode = {old_token2idx[w]: ix for w, ix in token2idx.items()}

    term_freq = {}
    for doc, counts in self.term_freq.items():
        # Sorting by new ID keeps each table's keys in ascending ID order.
        term_freq[doc] = dict(
            sorted(
                (recode[old_ix], count)
                for old_ix, count in counts.items()
                if old_ix in recode
            )
        )

    # Special tokens always occupy the last three IDs.
    for w in special:
        ix = len(token2idx)
        token2idx[w] = ix
        idx2token[ix] = w

    self.token2idx = token2idx
    self.idx2token = idx2token
    self.term_freq = term_freq
    self.vocab_counts = Counter({t.word: t.count for t in self._tokens})
933
934	def _calc_idf(self):
935	"""

fitMethod · 0.95

no outgoing calls

no test coverage detected