MCPcopy
hub / github.com/ddbourgin/numpy-ml / MLEGold

Class MLEGold

numpy_ml/tests/test_ngram.py:12–104  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

10
11
12class MLEGold:
def __init__(
    self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True
):
    """
    Record the model settings on the instance.

    Every argument is stored twice: as an instance attribute of the same
    name, and as an entry in the ``hyperparameters`` dict.

    Parameters
    ----------
    N : int
        Stored as ``self.N``. Presumably the largest n-gram order the
        model handles -- confirm against ``train``.
    K : int
        Stored as ``self.K``. Default is 1. Its role is not visible from
        this method -- TODO confirm against the code that reads it.
    unk : bool
        Stored as ``self.unk``. Default is True.
    filter_stopwords : bool
        Stored as ``self.filter_stopwords``. Default is True.
    filter_punctuation : bool
        Stored as ``self.filter_punctuation``. Default is True.
    """
    settings = {
        "N": N,
        "K": K,
        "unk": unk,
        "filter_stopwords": filter_stopwords,
        "filter_punctuation": filter_punctuation,
    }

    # Mirror each setting onto the instance; dict order matches the
    # order in which the attributes are expected to be created.
    for attr_name, attr_value in settings.items():
        setattr(self, attr_name, attr_value)

    self.hyperparameters = settings
29
30 def train(self, corpus_fp, vocab=None, encoding=None):
31 N = self.N
32 H = self.hyperparameters
33 models, counts = {}, {}
34 grams = {n: [] for n in range(1, N + 1)}
35 gg = {n: [] for n in range(1, N + 1)}
36 filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]
37
38 n_words = 0
39 tokens = set([])
40
41 with open(corpus_fp, "r", encoding=encoding) as text:
42 for line in text:
43 words = tokenize_words(line, filter_punc, filter_stop)
44
45 if vocab is not None:
46 words = vocab.filter(words, H["unk"])
47
48 if len(words) == 0:
49 continue
50
51 n_words += len(words)
52 tokens.update(words)
53
54 # calculate n, n-1, ... 1-grams
55 for n in range(1, N + 1):
56 grams[n].append(
57 nltk.ngrams(
58 words,
59 n,
60 pad_left=True,
61 pad_right=True,
62 left_pad_symbol="<bol>",
63 right_pad_symbol="<eol>",
64 )
65 )
66
67 gg[n].extend(
68 list(
69 nltk.ngrams(

Callers 1

test_mle · Function · 0.85

Calls

no outgoing calls

Tested by 1

test_mle · Function · 0.68