Tokenizer of GLM4.
| 350 | |
| 351 | |
class ChatGLM4Tokenizer(HuggingFaceTokenizer):
    """Tokenizer of GLM4.

    Behaves like ``HuggingFaceTokenizer`` with two adjustments: the wrapped
    tokenizer's ``_pad`` is replaced by a shim that discards the
    ``padding_side`` keyword, and ``encode`` always tokenizes with
    ``add_special_tokens=False``.
    """

    def __init__(self, model_path, trust_remote_code: bool = False):
        """Load the tokenizer and patch its ``_pad`` method.

        Args:
            model_path: forwarded unchanged to the base-class constructor.
            trust_remote_code: forwarded unchanged to the base-class
                constructor.
        """
        super().__init__(model_path, trust_remote_code=trust_remote_code)

        # Keep a handle on the unpatched method so the shim can delegate.
        # NOTE(review): `self.model` is presumably set by the base-class
        # constructor -- confirm against HuggingFaceTokenizer.
        wrapped_pad = self.model._pad

        def pad_without_padding_side(*args, **kwargs):
            # Drop `padding_side` if present, then defer to the original.
            kwargs.pop('padding_side', None)
            return wrapped_pad(*args, **kwargs)

        # Compatibility fix for transformers>4.45.0.
        # NOTE(review): presumably newer transformers pass `padding_side`
        # to `_pad` and the GLM4 tokenizer's `_pad` does not accept it --
        # confirm against the transformers changelog.
        self.model._pad = pad_without_padding_side

    def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, **kwargs):
        """Tokenize a prompt.

        Args:
            s: the prompt text.
            add_bos: passed positionally to the base-class ``encode``.
            add_special_tokens: accepted for signature compatibility but
                ignored; ``False`` is always passed to the base class.
            **kwargs: extra keyword arguments forwarded to the base-class
                ``encode``.

        Returns:
            Whatever the base-class ``encode`` returns.
        """
        # ChatGLM4Tokenizer hardcodes `add_special_tokens=False` when
        # tokenizing a prompt. Refer to
        # https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/tokenization_chatglm.py#L227  # noqa E501
        forced_special_tokens = False
        return super().encode(s, add_bos, add_special_tokens=forced_special_tokens, **kwargs)
| 372 | |
| 373 | |
| 374 | class ChatGLMTokenizer(HuggingFaceTokenizer): |