Encodes a string into a list of tokens using the underlying tokenizer. Args: content: The string to encode. Returns: A list of integer tokens.
(self, content: str)
| 2502 | self.tokenizer: TokenizerInterface = tokenizer |
| 2503 | |
def encode(self, content: str) -> List[int]:
    """
    Encodes a string into a list of tokens using the underlying tokenizer.

    If the tokenizer rejects the content with a ``ValueError`` whose message
    mentions "special token", the call is retried once with
    ``disallowed_special=()`` so that literal special-token strings (for
    example "<|endoftext|>") are encoded as ordinary text.

    Args:
        content: The string to encode.

    Returns:
        A list of integer tokens.

    Raises:
        ValueError: If the tokenizer rejects the content for a reason
            unrelated to special tokens, or if it rejects special tokens
            and does not accept the ``disallowed_special`` keyword. In the
            latter case the original error is re-raised unchanged.
    """
    try:
        return self.tokenizer.encode(content)
    except ValueError as e:
        # tiktoken (and some other tokenizers) raise ValueError when the
        # content contains literal special-token strings such as
        # "<|endoftext|>", because by default disallowed_special is the
        # full set of special tokens. This crashes document indexing on
        # any user content that happens to contain those strings — common
        # in documentation, notes, or model output captured in source
        # corpora. Retry with disallowed_special=() so the tokens are
        # encoded as ordinary text.
        if "special token" not in str(e):
            # Unrelated failure: propagate untouched, without retrying.
            raise
        try:
            return self.tokenizer.encode(content, disallowed_special=())
        except TypeError:
            # The tokenizer does not accept the kwarg, so the retry is not
            # possible. Surface the original error; ``from None`` keeps the
            # TypeError from this internal probe out of the reported
            # traceback, where it would only obscure the real cause.
            raise e from None
| 2532 | |
| 2533 | def decode(self, tokens: List[int]) -> str: |
| 2534 | """ |
no outgoing calls