def _encode_sentence(self, chunk:str) -> list[int]:
    """Return the token ids for one chunk of text.

    ``self._split_to_word`` cuts ``chunk`` into pieces via ``findall``; each
    piece is encoded to bytes (``str.encode`` default, i.e. UTF-8) and passed
    to ``self._encode_word``. The ids produced for every piece are
    concatenated in the order the pieces were found.
    """
    token_ids: list[int] = []
    for piece in self._split_to_word.findall(chunk):
        token_ids.extend(self._encode_word(piece.encode()))
    return token_ids
def encode(self, text:str) -> list[int]:
    """Convert ``text`` into a list of token ids.

    Every match of ``self._split_to_sentence`` is looked up verbatim in
    ``self._special_tokens`` and contributes exactly one id. The text before,
    between and after those matches is tokenized with
    ``self._encode_sentence``.

    Raises:
        KeyError: if a matched substring is not a key of
            ``self._special_tokens``.
    """
    ids: list[int] = []
    cursor = 0
    for special in self._split_to_sentence.finditer(text):
        begin, end = special.span()
        # Ordinary text preceding this match, then the id of the match itself.
        ids.extend(self._encode_sentence(text[cursor:begin]))
        ids.append(self._special_tokens[special.group()])
        cursor = end
    # Whatever follows the last match (the whole text when nothing matched).
    ids.extend(self._encode_sentence(text[cursor:]))
    return ids
| 61 | |
def decode(self, ids:list[int]) -> str:
    """Turn a sequence of token ids back into text.

    The byte string of each id is looked up in ``self._tok2bytes`` and all of
    them are joined *before* decoding, so a character whose bytes are spread
    over several tokens is still decoded as one character. Bytes that do not
    form valid text in the default codec (UTF-8) become U+FFFD instead of
    raising.
    """
    raw = b''.join([self._tok2bytes[token_id] for token_id in ids])
    return raw.decode(errors='replace')
| 63 | def stream_decoder(self) -> typing.Callable[..., str]: |