MCPcopy
hub / github.com/ddbourgin/numpy-ml / train

Method train

numpy_ml/tests/test_ngram.py:125–183  ·  view source on GitHub ↗
(self, corpus_fp, vocab=None, encoding=None)

Source from the content-addressed store, hash-verified

123 }
124
def train(self, corpus_fp, vocab=None, encoding=None):
    """
    Fit one Lidstone-smoothed n-gram model per order ``1..self.N``.

    The corpus file is read line by line. Each line is tokenized, optionally
    filtered through `vocab`, and expanded into padded n-grams for every
    order. Lines that yield no words are skipped.

    Parameters
    ----------
    corpus_fp : str
        Path of the corpus file; passed straight to ``open``.
    vocab : object or None
        If not None, ``vocab.filter(words, H["unk"])`` is applied to the
        tokens of every line before n-grams are built. Default is None.
    encoding : str or None
        Text encoding handed to ``open``. Default is None.

    Notes
    -----
    On completion the following attributes are set on the instance:

    - ``counts``: dict mapping order ``n`` to an ``nltk.FreqDist`` over all
      n-grams of that order.
    - ``_models``: dict mapping order ``n`` to a fitted
      ``nltk.lm.Lidstone`` model with ``gamma=self.K``.
    - ``n_words``: total number of (filtered) words seen across all
      non-empty lines.
    """
    N = self.N
    H = self.hyperparameters
    filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]
    orders = range(1, N + 1)

    # Padding configuration shared by every n-gram extraction below.
    pad_kwargs = {
        "pad_left": True,
        "pad_right": True,
        "left_pad_symbol": "<bol>",
        "right_pad_symbol": "<eol>",
    }

    # per_line_grams[n] keeps one n-gram list per corpus line (fed to the
    # language model); flat_grams[n] is the same data flattened (fed to the
    # frequency distribution).
    per_line_grams = {n: [] for n in orders}
    flat_grams = {n: [] for n in orders}

    n_words = 0
    tokens = set()

    with open(corpus_fp, "r", encoding=encoding) as text:
        for line in text:
            words = tokenize_words(line, filter_punc, filter_stop)

            if vocab is not None:
                words = vocab.filter(words, H["unk"])

            if len(words) == 0:
                continue

            n_words += len(words)
            tokens.update(words)

            # calculate n, n-1, ... 1-grams
            for n in orders:
                # Materialize once and reuse for both containers instead of
                # running nltk.ngrams twice over the same words.
                line_grams = list(nltk.ngrams(words, n, **pad_kwargs))
                per_line_grams[n].append(line_grams)
                flat_grams[n].extend(line_grams)

    counts, models = {}, {}
    for n in orders:
        counts[n] = nltk.FreqDist(flat_grams[n])
        models[n] = nltk.lm.Lidstone(order=n, gamma=self.K)
        models[n].fit(per_line_grams[n], tokens)

    self.counts = counts
    self._models = models
    self.n_words = n_words

Callers 1

test_additive (Function) · 0.95

Calls 4

tokenize_words (Function) · 0.85
filter (Method) · 0.80
update (Method) · 0.45
fit (Method) · 0.45

Tested by

no test coverage detected