Encodes a string into a list of token IDs. Args: s (str): The input string to be encoded. bos (bool): Whether to prepend the beginning-of-sequence token. eos (bool): Whether to append the end-of-sequence token. allowed_tokens (
(
self,
s: str,
*,
bos: bool,
eos: bool,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
disallowed_special: Union[Literal["all"], Collection[str]] = (),
)
| 93 | ) |
| 94 | |
| 95 | def encode( |
| 96 | self, |
| 97 | s: str, |
| 98 | *, |
| 99 | bos: bool, |
| 100 | eos: bool, |
| 101 | allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), |
| 102 | disallowed_special: Union[Literal["all"], Collection[str]] = (), |
| 103 | ) -> List[int]: |
| 104 | """ |
| 105 | Encodes a string into a list of token IDs. |
| 106 | |
| 107 | Args: |
| 108 | s (str): The input string to be encoded. |
| 109 | bos (bool): Whether to prepend the beginning-of-sequence token. |
| 110 | eos (bool): Whether to append the end-of-sequence token. |
| 111 | allowed_special ("all"|set[str]): allowed special tokens in string |
| 112 | disallowed_special ("all"|set[str]): special tokens that raise an error when in string |
| 113 | |
| 114 | Returns: |
| 115 | list[int]: A list of token IDs. |
| 116 | |
| 117 | By default, setting disallowed_special=() encodes a string by ignoring |
| 118 | special tokens. Specifically: |
| 119 | - Setting `disallowed_special` to () will cause all text corresponding |
| 120 | to special tokens to be encoded as natural text (instead of raising |
| 121 | an error). |
| 122 | - Setting `allowed_special` to "all" will cause all text corresponding |
| 123 | to special tokens to be encoded as special tokens. |
| 124 | """ |
| 125 | assert type(s) is str |
| 126 | |
| 127 | # The tiktoken tokenizer can handle <=400k chars without |
| 128 | # pyo3_runtime.PanicException. |
| 129 | TIKTOKEN_MAX_ENCODE_CHARS = 400_000 |
| 130 | |
| 131 | # https://github.com/openai/tiktoken/issues/195 |
| 132 | # Here we iterate over subsequences and split if we exceed the limit |
| 133 | # of max consecutive non-whitespace or whitespace characters. |
| 134 | MAX_NO_WHITESPACES_CHARS = 25_000 |
| 135 | |
| 136 | substrs = ( |
| 137 | substr |
| 138 | for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS) |
| 139 | for substr in self._split_whitespaces_or_nonwhitespaces( |
| 140 | s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS |
| 141 | ) |
| 142 | ) |
| 143 | t: List[int] = [] |
| 144 | for substr in substrs: |
| 145 | t.extend( |
| 146 | self.model.encode( |
| 147 | substr, |
| 148 | allowed_special=allowed_special, |
| 149 | disallowed_special=disallowed_special, |
| 150 | ) |
| 151 | ) |
| 152 | if bos: |
no test coverage detected