MCPcopy Index your code
hub / github.com/microsoft/BitNet / encode

Method encode

gpu/tokenizer.py:95–156  ·  view source on GitHub ↗

Encodes a string into a list of token IDs. Args: s (str): The input string to be encoded. bos (bool): Whether to prepend the beginning-of-sequence token. eos (bool): Whether to append the end-of-sequence token. allowed_special (…

(
        self,
        s: str,
        *,
        bos: bool,
        eos: bool,
        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = (),
    )

Source from the content-addressed store, hash-verified

93 )
94
95 def encode(
96 self,
97 s: str,
98 *,
99 bos: bool,
100 eos: bool,
101 allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
102 disallowed_special: Union[Literal["all"], Collection[str]] = (),
103 ) -> List[int]:
104 """
105 Encodes a string into a list of token IDs.
106
107 Args:
108 s (str): The input string to be encoded.
109 bos (bool): Whether to prepend the beginning-of-sequence token.
110 eos (bool): Whether to append the end-of-sequence token.
111 allowed_special ("all"|set[str]): allowed special tokens in string
112 disallowed_special ("all"|set[str]): special tokens that raise an error when in string
113
114 Returns:
115 list[int]: A list of token IDs.
116
117 By default, setting disallowed_special=() encodes a string by ignoring
118 special tokens. Specifically:
119 - Setting `disallowed_special` to () will cause all text corresponding
120 to special tokens to be encoded as natural text (instead of raising
121 an error).
122 - Setting `allowed_special` to "all" will cause all text corresponding
123 to special tokens to be encoded as special tokens.
124 """
125 assert type(s) is str
126
127 # The tiktoken tokenizer can handle <=400k chars without
128 # pyo3_runtime.PanicException.
129 TIKTOKEN_MAX_ENCODE_CHARS = 400_000
130
131 # https://github.com/openai/tiktoken/issues/195
132 # Here we iterate over subsequences and split if we exceed the limit
133 # of max consecutive non-whitespace or whitespace characters.
134 MAX_NO_WHITESPACES_CHARS = 25_000
135
136 substrs = (
137 substr
138 for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
139 for substr in self._split_whitespaces_or_nonwhitespaces(
140 s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
141 )
142 )
143 t: List[int] = []
144 for substr in substrs:
145 t.extend(
146 self.model.encode(
147 substr,
148 allowed_special=allowed_special,
149 disallowed_special=disallowed_special,
150 )
151 )
152 if bos:

Callers 15

get_vocab_base · Method · 0.80
get_vocab_base_pre · Method · 0.80
added_tokens · Method · 0.80
sentencepiece_tokens · Method · 0.80
added_tokens · Method · 0.80
hf_tokens · Method · 0.80
added_tokens · Method · 0.80
get_vocab_base_pre · Method · 0.80
added_tokens · Method · 0.80
sentencepiece_tokens · Method · 0.80

Tested by

no test coverage detected