| 311 | |
| 312 | |
class MLENGram(NGramBase):
    def __init__(self, N, unk=True, filter_stopwords=True, filter_punctuation=True):
        """
        An N-gram language model based on plain maximum-likelihood estimates
        (no smoothing is applied).

        Parameters
        ----------
        N : int
            The maximum length (in words) of the context window used by the
            language model. The model computes all n-grams of order 1, ..., N.
        unk : bool
            Whether to include the ``<unk>`` (unknown) token in the LM.
            Default is True.
        filter_stopwords : bool
            Whether to remove stopwords before training. Default is True.
        filter_punctuation : bool
            Whether to remove punctuation before training. Default is True.
        """
        super().__init__(N, unk, filter_stopwords, filter_punctuation)

        # Record which model variant these hyperparameters belong to.
        # NOTE(review): relies on the base initializer having created
        # ``self.hyperparameters`` -- confirm against NGramBase.
        self.hyperparameters["id"] = "MLENGram"

    def log_prob(self, words, N):
        """
        Score a sequence of words with the unsmoothed, maximum-likelihood
        `N`-gram language model.

        Parameters
        ----------
        words : list of strings
            A sequence of words
        N : int
            The gram-size of the language model to use when calculating the
            log probabilities of the sequence

        Returns
        -------
        total_prob : float
            The total log-probability of the sequence `words` under the
            `N`-gram language model
        """
        # The sequence-level computation lives in the inherited `_log_prob`.
        total_prob = self._log_prob(words, N)
        return total_prob

    def _log_ngram_prob(self, ngram):
        """
        Return the unsmoothed log probability of `ngram`.

        The value is ``log(count(ngram)) - log(count(context))``, where the
        context is `ngram` without its last element. For a 1-gram the
        denominator is ``self.n_words[1]`` instead. If either count is not
        strictly positive the result is ``-np.inf``.
        """
        order = len(ngram)
        ngram_count = self.counts[order][ngram]

        if order > 1:
            # Count of the (order - 1)-gram that precedes the final word.
            context_count = self.counts[order - 1][ngram[:-1]]
        else:
            # A 1-gram has no context, so normalize by ``n_words[1]``.
            context_count = self.n_words[1]

        # Only take logs when both counts are positive; otherwise the
        # estimate is a zero probability.
        if context_count > 0 and ngram_count > 0:
            return np.log(ngram_count) - np.log(context_count)
        return -np.inf
| 362 | |
| 363 | |
| 364 | class AdditiveNGram(NGramBase): |
no outgoing calls