MCPcopy
hub / github.com/ddbourgin/numpy-ml / MLENGram

Class MLENGram

numpy_ml/ngram/ngram.py:313–361  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

311
312
class MLENGram(NGramBase):
    def __init__(self, N, unk=True, filter_stopwords=True, filter_punctuation=True):
        """
        An unsmoothed, maximum-likelihood N-gram language model.

        Parameters
        ----------
        N : int
            The maximum length (in words) of the context-window to use in the
            language model. Model will compute all n-grams from 1, ..., N.
        unk : bool
            Whether to include the ``<unk>`` (unknown) token in the LM. Default
            is True.
        filter_stopwords : bool
            Whether to remove stopwords before training. Default is True.
        filter_punctuation : bool
            Whether to remove punctuation before training. Default is True.
        """
        # All configuration is forwarded unchanged to the base class.
        super().__init__(N, unk, filter_stopwords, filter_punctuation)

        # Record which concrete model these hyperparameters belong to.
        # NOTE(review): `hyperparameters` is presumably created by
        # `NGramBase.__init__` -- confirm against the base class.
        self.hyperparameters["id"] = "MLENGram"

    def log_prob(self, words, N):
        """
        Score a word sequence under the unsmoothed, maximum-likelihood
        `N`-gram language model.

        Parameters
        ----------
        words : list of strings
            A sequence of words
        N : int
            The gram-size of the language model to use when calculating the
            log probabilities of the sequence

        Returns
        -------
        total_prob : float
            The total log-probability of the sequence `words` under the
            `N`-gram language model
        """
        # Thin public wrapper: the work is delegated to the inherited
        # `_log_prob` helper.
        return self._log_prob(words, N)

    def _log_ngram_prob(self, ngram):
        """
        Return the unsmoothed log probability of `ngram`.

        The estimate is ``log(count(ngram)) - log(count(context))``, where the
        context is `ngram` without its final element. For a length-1 `ngram`
        the denominator is ``self.n_words[1]`` instead. If either count is not
        strictly positive the result is ``-np.inf``.
        """
        order = len(ngram)
        ngram_count = self.counts[order][ngram]

        if order > 1:
            # Condition on the preceding (order - 1) items of the n-gram.
            context_count = self.counts[order - 1][ngram[:-1]]
        else:
            # Unigrams have no context; normalize by `n_words[1]`.
            # NOTE(review): presumably the total unigram token count -- confirm.
            context_count = self.n_words[1]

        # Only take logs when both counts are positive, so log(0) never runs.
        if context_count > 0 and ngram_count > 0:
            return np.log(ngram_count) - np.log(context_count)
        return -np.inf
362
363
364class AdditiveNGram(NGramBase):

Callers 3

compare_probsFunction · 0.90
plot_gt_freqsFunction · 0.90
test_mleFunction · 0.85

Calls

no outgoing calls

Tested by 1

test_mleFunction · 0.68