Tokenize prompts or de-tokenize tokens into texts. Args: model_path: the path of the tokenizer model.
| 417 | |
| 418 | |
| 419 | class Tokenizer: |
| 420 | """Tokenize prompts or de-tokenize tokens into texts. |
| 421 | |
| 422 | Args: |
| 423 | model_path: the path of the tokenizer model. |
| 424 | """ |
| 425 | |
def __init__(self, model_path: str, trust_remote_code: bool = False):
    """Select and build the backend tokenizer for ``model_path``.

    The backend is chosen from the ``tokenizer_class`` entry of the tokenizer
    config first, then from the ``model_type`` of the model config;
    ``HuggingFaceTokenizer`` is used when neither matches.

    Args:
        model_path: the path of the tokenizer model.
        trust_remote_code: forwarded unchanged to the ``transformers``
            loaders and to the backend tokenizer constructor.
    """
    from transformers import AutoConfig, PretrainedConfig

    hf_kwargs = dict(trust_remote_code=trust_remote_code)
    try:
        config = AutoConfig.from_pretrained(model_path, **hf_kwargs)
    except Exception:
        # AutoConfig could not load the config; retry with the generic loader.
        config = PretrainedConfig.from_pretrained(model_path, **hf_kwargs)
    model_type = getattr(config, 'model_type', '')

    from transformers.models.auto.tokenization_auto import get_tokenizer_config

    declared_cls = get_tokenizer_config(model_path, **hf_kwargs).get('tokenizer_class')

    # The tokenizer class named in the tokenizer config takes precedence over
    # the model type; anything unrecognised falls back to the generic wrapper.
    if declared_cls == 'ChatGLM4Tokenizer':
        backend = ChatGLM4Tokenizer
    elif declared_cls == 'ChatGLMTokenizer':
        backend = ChatGLMTokenizer
    elif model_type == 'gpt_oss':
        backend = GptOssTokenizer
    else:
        backend = HuggingFaceTokenizer

    self.model = backend(model_path, trust_remote_code=trust_remote_code)
    self.logger = get_logger('lmdeploy')
| 445 | |
@property
def vocab_size(self):
    """Size of the vocabulary, as reported by the wrapped tokenizer."""
    backend = self.model
    return backend.vocab_size
| 450 | |
@property
def bos_token_id(self):
    """Id of the begin-of-sentence token, as reported by the wrapped tokenizer."""
    backend = self.model
    return backend.bos_token_id
| 455 | |
@property
def eos_token_id(self):
    """Id of the end-of-sentence token, as reported by the wrapped tokenizer."""
    backend = self.model
    return backend.eos_token_id
| 460 | |
def get_vocab(self):
    """Return whatever the wrapped tokenizer's ``get_vocab()`` returns."""
    backend = self.model
    return backend.get_vocab()
| 464 | |
| 465 | def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, **kwargs): |
| 466 | """Tokenize a prompt. |
| 467 | |
| 468 | Args: |
| 469 | s: a prompt. |
| 470 | add_bos: Whether to add ``bos`` token id when encoding the prompt. |
| 471 | add_special_tokens: Whether or not to add special tokens |
| 472 | when encoding the prompt. |
| 473 | |
| 474 | Returns: |
| 475 | list[int]: token ids. |
| 476 | """ |
no outgoing calls