(self, corpus_fp, vocab=None, encoding=None)
| 28 | } |
| 29 | |
def train(self, corpus_fp, vocab=None, encoding=None):
    """
    Fit one NLTK maximum-likelihood n-gram model per order ``1..N``.

    The corpus is read line by line. Each line is tokenized with
    ``tokenize_words``, optionally passed through ``vocab.filter``, and
    turned into padded n-grams for every order from 1 to ``self.N``.

    Parameters
    ----------
    corpus_fp : str
        Path to the text corpus. It is opened in text mode and each line
        is processed independently.
    vocab : object or None
        If not None, ``vocab.filter(words, H["unk"])`` is applied to the
        tokens of each line before counting.
        NOTE(review): presumably this maps out-of-vocabulary words to the
        unknown-token symbol -- confirm against the vocabulary class.
    encoding : str or None
        Forwarded unchanged to ``open``. Default is None.

    Notes
    -----
    On completion the following attributes are set on the instance:

    - ``counts``: dict mapping each order ``n`` to an ``nltk.FreqDist``
      over all padded n-grams of that order in the corpus.
    - ``n_words``: total number of tokens over all non-empty lines.
    - ``_models``: dict mapping each order ``n`` to an
      ``nltk.lm.MLE(order=n)`` fitted on the per-line n-grams, with the
      set of observed tokens passed as the vocabulary text.
    """
    N = self.N
    H = self.hyperparameters
    filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]
    orders = range(1, N + 1)

    # grams[n] holds one list of padded n-grams per non-empty corpus line.
    grams = {n: [] for n in orders}
    n_words = 0
    tokens = set()

    with open(corpus_fp, "r", encoding=encoding) as text:
        for line in text:
            words = tokenize_words(line, filter_punc, filter_stop)

            if vocab is not None:
                words = vocab.filter(words, H["unk"])

            # lines that yield no tokens contribute nothing
            if len(words) == 0:
                continue

            n_words += len(words)
            tokens.update(words)

            # calculate n, n-1, ... 1-grams
            for n in orders:
                # Materialize the n-grams once so the same sequence can be
                # used for both the frequency counts and the model fit,
                # instead of generating every n-gram twice.
                grams[n].append(
                    list(
                        nltk.ngrams(
                            words,
                            n,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol="<bol>",
                            right_pad_symbol="<eol>",
                        )
                    )
                )

    counts, models = {}, {}
    for n in orders:
        # Flatten the per-line n-grams (in corpus order) for counting.
        counts[n] = nltk.FreqDist(
            gram for line_grams in grams[n] for gram in line_grams
        )
        models[n] = nltk.lm.MLE(order=n)
        models[n].fit(grams[n], tokens)

    self.counts = counts
    self.n_words = n_words
    self._models = models
no test coverage detected