Method train

numpy_ml/tests/test_ngram.py:30–88 · view source on GitHub ↗

(self, corpus_fp, vocab=None, encoding=None)

Source from the content-addressed store, hash-verified

28	}
29
def train(self, corpus_fp, vocab=None, encoding=None):
    """
    Fit one NLTK MLE language model per n-gram order (1 through ``self.N``).

    The corpus is read line by line. Each line is tokenized with
    ``tokenize_words`` using the ``"filter_punctuation"`` and
    ``"filter_stopwords"`` hyperparameters, and lines that yield no words
    are skipped.

    Parameters
    ----------
    corpus_fp : str
        Path to the corpus file.
    vocab : object or None
        If not None, each tokenized line is passed through
        ``vocab.filter(words, unk)`` where ``unk`` is the ``"unk"``
        hyperparameter. NOTE(review): presumably this maps
        out-of-vocabulary words to the unknown token -- confirm against
        the vocabulary class. Default is None.
    encoding : str or None
        Forwarded unchanged to ``open``. Default is None.

    Notes
    -----
    On completion the following attributes are set:

    - ``self.counts``: dict mapping each order ``n`` to an
      ``nltk.FreqDist`` over the padded n-grams of that order.
    - ``self.n_words``: total number of words kept across all lines.
    - ``self._models``: dict mapping each order ``n`` to a fitted
      ``nltk.lm.MLE`` model.
    """
    N = self.N
    H = self.hyperparameters
    filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]

    n_words = 0
    tokens = set()

    # grams[n] holds one list of padded n-gram tuples per non-empty line.
    grams = {n: [] for n in range(1, N + 1)}

    with open(corpus_fp, "r", encoding=encoding) as text:
        for line in text:
            words = tokenize_words(line, filter_punc, filter_stop)

            if vocab is not None:
                words = vocab.filter(words, H["unk"])

            if len(words) == 0:
                continue

            n_words += len(words)
            tokens.update(words)

            # calculate n, n-1, ... 1-grams
            for n in range(1, N + 1):
                # Materialize once: the same n-grams feed both the raw
                # counts and the model fit below, and a generator could
                # only be consumed by one of them.
                grams[n].append(
                    list(
                        nltk.ngrams(
                            words,
                            n,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol="<bol>",
                            right_pad_symbol="<eol>",
                        )
                    )
                )

    models, counts = {}, {}
    for n in range(1, N + 1):
        # Flatten the per-line lists so the counts cover the whole corpus.
        counts[n] = nltk.FreqDist(
            gram for line_grams in grams[n] for gram in line_grams
        )
        models[n] = nltk.lm.MLE(order=n)
        models[n].fit(grams[n], tokens)

    self.counts = counts
    self.n_words = n_words
    self._models = models

test_mleFunction · 0.95

test_additiveFunction · 0.45

tokenize_wordsFunction · 0.85

filterMethod · 0.80

updateMethod · 0.45

fitMethod · 0.45

no test coverage detected