| 76 | re.IGNORECASE) |
| 77 | |
def bpe(self, token):
    """Apply the learned byte-pair merges to a single token.

    The token is split into one symbol per character, with the end-of-word
    marker ``'</w>'`` appended to the last symbol. The adjacent pair with the
    lowest value in ``self.bpe_ranks`` is then merged repeatedly until no
    remaining pair is ranked or the word has collapsed into one symbol.

    Results are memoised in ``self.cache``, keyed by the raw token.

    Args:
        token: The string to segment.

    Returns:
        The resulting symbols joined by single spaces; the final symbol
        carries the ``'</w>'`` suffix. An empty token yields ``''``.
    """
    if token in self.cache:
        return self.cache[token]
    if not token:
        # There is no last character to tag with '</w>'; indexing token[-1]
        # would raise IndexError, so return an empty segmentation instead.
        return ''

    word = tuple(token[:-1]) + (token[-1] + '</w>',)
    # NOTE(review): get_pairs is defined elsewhere in this file; presumably it
    # returns the collection of adjacent symbol pairs in `word` -- confirm.
    pairs = get_pairs(word)

    if not pairs:
        # Nothing to merge; cache the result like every other path does.
        result = token + '</w>'
        self.cache[token] = result
        return result

    ranks = self.bpe_ranks
    unranked = float('inf')
    while True:
        # Pick the pair with the lowest rank value; unknown pairs sort last.
        bigram = min(pairs, key=lambda pair: ranks.get(pair, unranked))
        if bigram not in ranks:
            break
        first, second = bigram
        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                # No further occurrence of `first`: copy the tail unchanged.
                new_word.extend(word[i:])
                break
            new_word.extend(word[i:j])
            i = j

            # word[i] == first is guaranteed by index(); only the successor
            # needs checking to decide whether to merge here.
            if i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        word = tuple(new_word)
        if len(word) == 1:
            break
        pairs = get_pairs(word)

    result = ' '.join(word)
    self.cache[token] = result
    return result
| 118 | |
| 119 | def encode(self, text): |
| 120 | bpe_tokens = [] |