Replace `bigram` with a single token and update vocab accordingly
(self, bigram, vocab)
| 271 | return pair_counts |
| 272 | |
| 273 | def _merge(self, bigram, vocab): |
| 274 | """Replace `bigram` with a single token and update vocab accordingly""" |
| 275 | v_out = {} |
| 276 | bg = re.escape(" ".join(bigram)) |
| 277 | bigram_regex = re.compile(r"(?<!\S)" + bg + r"(?!\S)") |
| 278 | for word in vocab.keys(): |
| 279 | # bigram "a b" becomes "a-b" |
| 280 | w_out = bigram_regex.sub("-".join(bigram), word) |
| 281 | v_out[w_out] = vocab[word] |
| 282 | return v_out |
| 283 | |
| 284 | def transform(self, text): |
| 285 | """ |