MCPcopy Index your code
hub / github.com/microsoft/BitNet / _split_whitespaces_or_nonwhitespaces

Method _split_whitespaces_or_nonwhitespaces

gpu/tokenizer.py:172–195  ·  view source on GitHub ↗

Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len` consecutive whitespaces or consecutive non-whitespaces.

(
        s: str, max_consecutive_slice_len: int
    )

Source from the content-addressed store, hash-verified

170
@staticmethod
def _split_whitespaces_or_nonwhitespaces(
    s: str, max_consecutive_slice_len: int
) -> Iterator[str]:
    """
    Yield contiguous pieces of `s` such that no piece holds a run of more than
    `max_consecutive_slice_len` whitespace characters, nor a run of more than
    `max_consecutive_slice_len` non-whitespace characters.

    A cut is made only at the position where a run would exceed the limit, so
    a piece may contain several shorter runs of alternating kinds. The pieces
    are adjacent slices of `s`; joining them in order reproduces `s`. An empty
    `s` produces a single empty string.
    """
    chunk_start = 0
    run_length = 0
    # Seed the run kind from the first character so that index 0 extends the
    # current run instead of registering as a kind change. An empty string
    # slices to "", whose isspace() is False.
    run_is_space = s[:1].isspace()

    for pos, char in enumerate(s):
        char_is_space = char.isspace()

        if char_is_space != run_is_space:
            # Kind flipped: this character opens a fresh run of length one.
            run_is_space = char_is_space
            run_length = 1
            continue

        run_length += 1
        if run_length > max_consecutive_slice_len:
            # The run just outgrew the limit: emit everything before this
            # character and let it start both a new piece and a new count.
            yield s[chunk_start:pos]
            chunk_start = pos
            run_length = 1

    # Whatever remains after the last cut (possibly all of `s`).
    yield s[chunk_start:]
196
197class ChatFormat:
198 def __init__(self, tokenizer: Tokenizer):

Callers 1

encodeMethod · 0.95

Calls

no outgoing calls

Tested by

no test coverage detected