Decodes a list of tokens into a string and a list of offsets. Each offset is the index into text corresponding to the start of each token. If UTF-8 character boundaries do not line up with token boundaries, the offset is the index of the first character that contains bytes from the token.
(self, tokens: Sequence[int])
| 310 | return [self.decode_single_token_bytes(token) for token in tokens] |
| 311 | |
def decode_with_offsets(self, tokens: Sequence[int]) -> tuple[str, list[int]]:
    """Decode ``tokens`` into text and report where each token begins in it.

    The second element of the result holds one entry per token: the index in
    the decoded text at which that token starts. When a token begins in the
    middle of a multi-byte UTF-8 character, its entry is the index of the
    character that those leading bytes belong to.

    Decoding is strict, so tokens whose combined bytes are not valid UTF-8
    currently raise; this may become more permissive in the future.

    >>> enc.decode_with_offsets([31373, 995])
    ('hello world', [0, 5])
    """
    pieces = self.decode_tokens_bytes(tokens)

    starts = []
    chars_seen = 0
    for piece in pieces:
        # A first byte in 0x80..0xBF is a UTF-8 continuation byte: the
        # character it belongs to was opened by an earlier token, so this
        # token is attributed to that previous character.
        if 0x80 <= piece[0] < 0xC0:
            start = chars_seen - 1
        else:
            start = chars_seen
        # Clamp at zero for a continuation byte at the very start of the text.
        starts.append(start if start > 0 else 0)
        # Every non-continuation byte opens exactly one character.
        chars_seen += sum(not 0x80 <= byte < 0xC0 for byte in piece)

    # TODO: assess correctness for errors="ignore" and errors="replace"
    joined = b"".join(pieces)
    return joined.decode("utf-8", errors="strict"), starts
| 336 | |
| 337 | def decode_batch( |
| 338 | self, batch: Sequence[Sequence[int]], *, errors: str = "replace", num_threads: int = 8 |