Draws a log-log scatterplot of the empirical (MLE) frequencies of the counted species alongside their Simple Good-Turing and Laplace smoothed values, in rank order. Depends on pylab and matplotlib.
(fp)
| 78 | |
| 79 | |
def plot_gt_freqs(fp):
    """
    Plot rank-ordered token probabilities under three estimators.

    Trains an ``MLENGram``, a ``GoodTuringNGram`` and an ``AdditiveNGram``
    (each constructed with N=1 and with punctuation / stopword filtering
    disabled) on the corpus at `fp`. For every token counted by the MLE
    model it then computes:

    - the empirical relative frequency (count / total count),
    - the simple Good-Turing probability, ``exp(GT.log_prob(tok, 1))``,
    - the additive-smoothing probability, ``exp(ADD.log_prob(tok, 1))``.

    Each set of probabilities is sorted in descending order and drawn on
    shared log-log axes against its rank (1 = most probable). The figure is
    saved to ``img/rank_probs.png`` and all open figures are closed.

    Depends on numpy (``np``) and matplotlib's pyplot interface (``plt``).

    Parameters
    ----------
    fp : str
        Path to the training corpus. It is forwarded unchanged to each
        model's ``train`` method together with ``encoding="utf-8-sig"``.

    Returns
    -------
    None
    """
    MLE = MLENGram(1, filter_punctuation=False, filter_stopwords=False)
    MLE.train(fp, encoding="utf-8-sig")
    counts = dict(MLE.counts[1])

    GT = GoodTuringNGram(1, filter_stopwords=False, filter_punctuation=False)
    GT.train(fp, encoding="utf-8-sig")

    # NOTE(review): the second positional argument (1) is presumably the
    # additive constant K, i.e. add-one smoothing, which would match the
    # "Laplace smoothing" legend label -- confirm against AdditiveNGram.
    ADD = AdditiveNGram(1, 1, filter_punctuation=False, filter_stopwords=False)
    ADD.train(fp, encoding="utf-8-sig")

    # float() keeps the division below a true division regardless of the
    # numeric type of the stored counts.
    tot = float(sum(counts.values()))
    freqs = {tok: cnt / tot for tok, cnt in counts.items()}
    sgt_probs = {tok: np.exp(GT.log_prob(tok, 1)) for tok in counts}
    as_probs = {tok: np.exp(ADD.log_prob(tok, 1)) for tok in counts}

    series = (
        (freqs, "k+", "MLE"),
        (sgt_probs, "r+", "simple Good-Turing"),
        (as_probs, "b+", "Laplace smoothing"),
    )
    for probs, marker, label in series:
        # Ranks start at 1: a log-scaled x axis cannot represent 0, so a
        # 0-based rank would lose the most probable token of every series.
        ranks = np.arange(1, len(probs) + 1)
        ordered = sorted(probs.values(), reverse=True)
        plt.loglog(ranks, ordered, marker, alpha=0.25, label=label)

    plt.xlabel("Rank")
    plt.ylabel("Probability")
    plt.legend()
    plt.tight_layout()
    # Path is relative to the current working directory; savefig does not
    # create the "img" directory if it is missing.
    plt.savefig("img/rank_probs.png")
    plt.close("all")
nothing calls this directly
no test coverage detected