MCPcopy
hub / github.com/ddbourgin/numpy-ml / train

Method train

numpy_ml/tests/test_ngram.py:30–88  ·  view source on GitHub ↗
(self, corpus_fp, vocab=None, encoding=None)

Source from the content-addressed store, hash-verified

28 }
29
def train(self, corpus_fp, vocab=None, encoding=None):
    """
    Fit NLTK n-gram models of every order from 1 to ``self.N`` on a corpus.

    Parameters
    ----------
    corpus_fp : str
        Path to the corpus file. The file is read one line at a time and
        each line is tokenized independently.
    vocab : object or None
        If not None, each line's tokens are replaced by the result of
        ``vocab.filter(words, self.hyperparameters["unk"])`` before any
        counting takes place. Default is None.
    encoding : str or None
        Passed straight through to ``open``. Default is None.

    Notes
    -----
    Nothing is returned. On completion the following attributes are set:

    - ``self.counts``: dict mapping each order ``n`` to an
      ``nltk.FreqDist`` over all padded n-grams of that order.
    - ``self.n_words``: total number of tokens kept across all non-empty
      lines.
    - ``self._models``: dict mapping each order ``n`` to an
      ``nltk.lm.MLE`` model fit on the per-line n-grams.
    """
    N = self.N
    H = self.hyperparameters
    filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]
    orders = range(1, N + 1)

    # Padding settings shared by every n-gram extraction below.
    pad_kwargs = {
        "pad_left": True,
        "pad_right": True,
        "left_pad_symbol": "<bol>",
        "right_pad_symbol": "<eol>",
    }

    # per_line_grams[n] keeps one list of n-grams per corpus line (the shape
    # handed to `MLE.fit`); all_grams[n] is the same data flattened into a
    # single list (the shape handed to `FreqDist`).
    per_line_grams = {n: [] for n in orders}
    all_grams = {n: [] for n in orders}

    n_words = 0
    tokens = set()

    with open(corpus_fp, "r", encoding=encoding) as text:
        for line in text:
            words = tokenize_words(line, filter_punc, filter_stop)

            if vocab is not None:
                words = vocab.filter(words, H["unk"])

            # Lines with no remaining tokens contribute nothing.
            if len(words) == 0:
                continue

            n_words += len(words)
            tokens.update(words)

            # calculate n, n-1, ... 1-grams
            for n in orders:
                # Materialize once and reuse for both accumulators instead
                # of generating the same n-grams twice.
                line_grams = list(nltk.ngrams(words, n, **pad_kwargs))
                per_line_grams[n].append(line_grams)
                all_grams[n].extend(line_grams)

    models, counts = {}, {}
    for n in orders:
        counts[n] = nltk.FreqDist(all_grams[n])
        models[n] = nltk.lm.MLE(order=n)
        # `tokens` is supplied as the vocabulary text for the model.
        models[n].fit(per_line_grams[n], tokens)

    self.counts = counts
    self.n_words = n_words
    self._models = models

Callers 2

test_mleFunction · 0.95
test_additiveFunction · 0.45

Calls 4

tokenize_wordsFunction · 0.85
filterMethod · 0.80
updateMethod · 0.45
fitMethod · 0.45

Tested by

no test coverage detected