(self, text)
| 99 | return word |
| 100 | |
def encode(self, text):
    """Translate *text* into a flat list of vocabulary entries.

    The text is cut into pieces with ``re.findall(self.pat, text)``.
    Each piece is UTF-8 encoded, every byte is mapped through
    ``self.byte_encoder`` and the results are joined into one string.
    That string is handed to ``self.bpe``, whose result is split on
    single spaces; each resulting symbol is looked up in ``self.encoder``.

    Args:
        text: The string to tokenize.

    Returns:
        A list holding the ``self.encoder`` value of every symbol, in the
        order the pieces occur in *text*. Empty when the pattern matches
        nothing.

    Lookups in ``self.byte_encoder`` and ``self.encoder`` are unguarded,
    so a missing entry propagates the mapping's own lookup error.
    """
    ids = []
    for piece in re.findall(self.pat, text):
        # Re-spell the piece byte by byte through the byte_encoder table.
        raw = piece.encode("utf-8")
        mapped = "".join([self.byte_encoder[b] for b in raw])
        # self.bpe yields space-separated symbols; look each one up.
        for symbol in self.bpe(mapped).split(" "):
            ids.append(self.encoder[symbol])
    return ids
| 107 | |
| 108 | def decode(self, tokens): |
| 109 | text = "".join([self.decoder[token] for token in tokens]) |