| 10 | |
| 11 | |
| 12 | class MLEGold: |
def __init__(
    self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True
):
    """
    Record the model configuration on the instance.

    Each argument is stored twice: as an attribute of the same name and as
    an entry of the ``hyperparameters`` dict, which ``train`` reads from.

    Parameters
    ----------
    N
        Largest n-gram order; ``train`` collects 1-grams through N-grams.
    K
        Stored as given. Not used in the part of the class visible here --
        NOTE(review): confirm its meaning against the rest of the file.
    unk
        Forwarded by ``train`` to ``vocab.filter`` when a vocab is supplied.
    filter_stopwords
        Forwarded by ``train`` to ``tokenize_words``.
    filter_punctuation
        Forwarded by ``train`` to ``tokenize_words``.
    """
    settings = {
        "N": N,
        "K": K,
        "unk": unk,
        "filter_stopwords": filter_stopwords,
        "filter_punctuation": filter_punctuation,
    }

    # Mirror every setting onto the instance under the same name, in the
    # same order the keys are declared above.
    for attr_name, attr_value in settings.items():
        setattr(self, attr_name, attr_value)

    self.hyperparameters = settings
| 29 | |
| 30 | def train(self, corpus_fp, vocab=None, encoding=None): |
| 31 | N = self.N |
| 32 | H = self.hyperparameters |
| 33 | models, counts = {}, {} |
| 34 | grams = {n: [] for n in range(1, N + 1)} |
| 35 | gg = {n: [] for n in range(1, N + 1)} |
| 36 | filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"] |
| 37 | |
| 38 | n_words = 0 |
| 39 | tokens = set([]) |
| 40 | |
| 41 | with open(corpus_fp, "r", encoding=encoding) as text: |
| 42 | for line in text: |
| 43 | words = tokenize_words(line, filter_punc, filter_stop) |
| 44 | |
| 45 | if vocab is not None: |
| 46 | words = vocab.filter(words, H["unk"]) |
| 47 | |
| 48 | if len(words) == 0: |
| 49 | continue |
| 50 | |
| 51 | n_words += len(words) |
| 52 | tokens.update(words) |
| 53 | |
| 54 | # calculate n, n-1, ... 1-grams |
| 55 | for n in range(1, N + 1): |
| 56 | grams[n].append( |
| 57 | nltk.ngrams( |
| 58 | words, |
| 59 | n, |
| 60 | pad_left=True, |
| 61 | pad_right=True, |
| 62 | left_pad_symbol="<bol>", |
| 63 | right_pad_symbol="<eol>", |
| 64 | ) |
| 65 | ) |
| 66 | |
| 67 | gg[n].extend( |
| 68 | list( |
| 69 | nltk.ngrams( |