(fp, N)
| 29 | |
| 30 | |
def compare_probs(fp, N):
    """
    Plot additive-smoothing log probabilities as a function of K.

    An MLE model and a Good-Turing model are each trained once on `fp`. An
    additive-smoothed model is then trained for each of 20 evenly spaced
    values of K in [0, 10], and the log probability of two fixed word
    sequences (labelled "seen" and "unseen" in the figure legend) is recorded
    for every K. The additive curves, together with the K-independent MLE
    value for the "seen" sequence, are drawn on one figure that is written to
    ``img/add_smooth.png`` (relative to the current working directory).

    Parameters
    ----------
    fp
        Training corpus handed unchanged to each model's ``train`` method
        (presumably a file path -- confirm against the model classes).
    N
        N-gram order handed to every model constructor and ``log_prob`` call.

    Returns
    -------
    None
        The function is run for its side effect of saving the figure.
    """
    import os  # local import: only needed to prepare the output directory

    MLE = MLENGram(N, unk=False, filter_punctuation=False, filter_stopwords=False)
    MLE.train(fp, encoding="utf-8-sig")

    GTT = GoodTuringNGram(
        N, conf=1.96, unk=False, filter_stopwords=False, filter_punctuation=False
    )
    GTT.train(fp, encoding="utf-8-sig")

    seen = ("<bol>", "the")
    unseen = ("<bol>", "asdf")

    # Single shared grid of smoothing constants for both the loop and the plots.
    Ks = np.linspace(0, 10, 20)

    # Neither the MLE nor the Good-Turing model depends on K, so each value is
    # evaluated once and repeated to line up with the K axis.
    mle_y = [MLE.log_prob(seen, N)] * len(Ks)
    # The Good-Turing series are only referenced by the disabled plot calls
    # below; they are kept so those lines can be re-enabled as-is.
    gtt_y = [GTT.log_prob(seen, N)] * len(Ks)
    gttu_y = [GTT.log_prob(unseen, N)] * len(Ks)

    add_y, addu_y = [], []
    for K in Ks:
        # A fresh additive model has to be trained for every K.
        ADD = AdditiveNGram(
            N, K, unk=False, filter_punctuation=False, filter_stopwords=False
        )
        ADD.train(fp, encoding="utf-8-sig")

        add_y.append(ADD.log_prob(seen, N))
        addu_y.append(ADD.log_prob(unseen, N))

    plt.plot(Ks, add_y, label="Additive (seen ngram)")
    plt.plot(Ks, addu_y, label="Additive (unseen ngram)")
    # plt.plot(Ks, gtt_y, label="Good-Turing (seen ngram)")
    # plt.plot(Ks, gttu_y, label="Good-Turing (unseen ngram)")
    plt.plot(Ks, mle_y, "--", label="MLE (seen ngram)")
    plt.xlabel("K")
    plt.ylabel("log P(sequence)")
    plt.legend()

    # savefig does not create missing parent directories, so make sure the
    # output folder exists before writing.
    out_path = "img/add_smooth.png"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.savefig(out_path)
    plt.close("all")
| 78 | |
| 79 | |
| 80 | def plot_gt_freqs(fp): |
nothing calls this directly
no test coverage detected