| 105 | |
| 106 | |
| 107 | class AdditiveGold: |
| 108 | def __init__( |
| 109 | self, N, K=1, unk=True, filter_stopwords=True, filter_punctuation=True |
| 110 | ): |
| 111 | self.N = N |
| 112 | self.K = K |
| 113 | self.unk = unk |
| 114 | self.filter_stopwords = filter_stopwords |
| 115 | self.filter_punctuation = filter_punctuation |
| 116 | |
| 117 | self.hyperparameters = { |
| 118 | "N": N, |
| 119 | "K": K, |
| 120 | "unk": unk, |
| 121 | "filter_stopwords": filter_stopwords, |
| 122 | "filter_punctuation": filter_punctuation, |
| 123 | } |
| 124 | |
| 125 | def train(self, corpus_fp, vocab=None, encoding=None): |
| 126 | N = self.N |
| 127 | H = self.hyperparameters |
| 128 | models, counts = {}, {} |
| 129 | grams = {n: [] for n in range(1, N + 1)} |
| 130 | gg = {n: [] for n in range(1, N + 1)} |
| 131 | filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"] |
| 132 | |
| 133 | n_words = 0 |
| 134 | tokens = set() |
| 135 | |
| 136 | with open(corpus_fp, "r", encoding=encoding) as text: |
| 137 | for line in text: |
| 138 | words = tokenize_words(line, filter_punc, filter_stop) |
| 139 | |
| 140 | if vocab is not None: |
| 141 | words = vocab.filter(words, H["unk"]) |
| 142 | |
| 143 | if len(words) == 0: |
| 144 | continue |
| 145 | |
| 146 | n_words += len(words) |
| 147 | tokens.update(words) |
| 148 | |
| 149 | # calculate n, n-1, ... 1-grams |
| 150 | for n in range(1, N + 1): |
| 151 | grams[n].append( |
| 152 | nltk.ngrams( |
| 153 | words, |
| 154 | n, |
| 155 | pad_left=True, |
| 156 | pad_right=True, |
| 157 | left_pad_symbol="<bol>", |
| 158 | right_pad_symbol="<eol>", |
| 159 | ) |
| 160 | ) |
| 161 | |
| 162 | gg[n].extend( |
| 163 | list( |
| 164 | nltk.ngrams( |
no outgoing calls