Splits the string `s` into substrings such that each one contains at most `max_consecutive_slice_len` consecutive whitespace characters or consecutive non-whitespace characters.
(
s: str, max_consecutive_slice_len: int
)
| 170 | |
@staticmethod
def _split_whitespaces_or_nonwhitespaces(
    s: str, max_consecutive_slice_len: int
) -> Iterator[str]:
    """
    Lazily cut `s` into pieces so that no piece holds a run of more than
    `max_consecutive_slice_len` consecutive whitespace characters or
    consecutive non-whitespace characters.

    A "run" is a maximal stretch of characters that are all whitespace or
    all non-whitespace, as classified by `str.isspace`. A cut is made only
    when a run grows past the limit; shorter runs of alternating kinds stay
    together in the same piece. Concatenating the yielded pieces in order
    reproduces `s`.

    The trailing remainder is always yielded, so an empty `s` produces a
    single empty string.
    """
    # Kind (whitespace or not) of the run currently being measured; an
    # empty input is treated as starting with a non-whitespace run.
    run_is_space = bool(s) and s[0].isspace()
    run_len = 0
    piece_start = 0

    for pos, ch in enumerate(s):
        ch_is_space = ch.isspace()

        if ch_is_space != run_is_space:
            # The character kind flipped: a new run begins at this character.
            run_is_space = ch_is_space
            run_len = 1
            continue

        run_len += 1
        if run_len > max_consecutive_slice_len:
            # The run just exceeded the limit: emit everything before this
            # character and start the next piece (and run count) here.
            yield s[piece_start:pos]
            piece_start = pos
            run_len = 1

    yield s[piece_start:]
| 196 | |
| 197 | class ChatFormat: |
| 198 | def __init__(self, tokenizer: Tokenizer): |