MCPcopy
hub / github.com/InternLM/lmdeploy / Tokenizer

Class Tokenizer

lmdeploy/tokenizer.py:419–550  ·  view source on GitHub ↗

Tokenizes prompts into token ids and de-tokenizes token ids back into text. Args: `model_path` — the path of the tokenizer model.

Source from the content-addressed store, hash-verified

417
418
419class Tokenizer:
420 """Tokenize prompts or de-tokenize tokens into texts.
421
422 Args:
423 model_path: the path of the tokenizer model.
424 """
425
def __init__(self, model_path: str, trust_remote_code: bool = False):
    """Pick and build the tokenizer backend for ``model_path``.

    The backend is stored on ``self.model`` and is chosen in this order:

    1. ``ChatGLM4Tokenizer`` when the tokenizer config's
       ``tokenizer_class`` is ``'ChatGLM4Tokenizer'``.
    2. ``ChatGLMTokenizer`` when it is ``'ChatGLMTokenizer'``.
    3. ``GptOssTokenizer`` when the model config's ``model_type`` is
       ``'gpt_oss'``.
    4. ``HuggingFaceTokenizer`` otherwise.

    Args:
        model_path: the path of the tokenizer model.
        trust_remote_code: forwarded unchanged to the ``transformers``
            config loaders and to the selected tokenizer backend.
    """
    # Imported lazily, inside the constructor, as in the original code.
    from transformers import AutoConfig, PretrainedConfig
    from transformers.models.auto.tokenization_auto import get_tokenizer_config

    try:
        model_cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
    except Exception:
        # Deliberate best-effort: whatever AutoConfig failed on, retry with
        # the generic PretrainedConfig loader. Only ``model_type`` is read
        # from the result below.
        model_cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
    is_gpt_oss = getattr(model_cfg, 'model_type', '') == 'gpt_oss'

    tokenizer_config = get_tokenizer_config(model_path, trust_remote_code=trust_remote_code)
    config_tokenizer_class = tokenizer_config.get('tokenizer_class')

    # The tokenizer-class checks take precedence over the gpt_oss check.
    if config_tokenizer_class == 'ChatGLM4Tokenizer':
        self.model = ChatGLM4Tokenizer(model_path, trust_remote_code=trust_remote_code)
    elif config_tokenizer_class == 'ChatGLMTokenizer':
        self.model = ChatGLMTokenizer(model_path, trust_remote_code=trust_remote_code)
    elif is_gpt_oss:
        self.model = GptOssTokenizer(model_path, trust_remote_code=trust_remote_code)
    else:
        self.model = HuggingFaceTokenizer(model_path, trust_remote_code=trust_remote_code)
    self.logger = get_logger('lmdeploy')
445
@property
def vocab_size(self):
    """Return the ``vocab_size`` reported by the wrapped tokenizer."""
    backend = self.model
    return backend.vocab_size
450
@property
def bos_token_id(self):
    """Return the wrapped tokenizer's begin-of-sentence token id."""
    backend = self.model
    return backend.bos_token_id
455
@property
def eos_token_id(self):
    """Return the wrapped tokenizer's end-of-sentence token id."""
    backend = self.model
    return backend.eos_token_id
460
def get_vocab(self):
    """Return whatever the wrapped tokenizer's ``get_vocab()`` returns."""
    backend = self.model
    return backend.get_vocab()
464
465 def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, **kwargs):
466 """Tokenize a prompt.
467
468 Args:
469 s: a prompt.
470 add_bos: Whether to add ``bos`` token id when encoding the prompt.
471 add_special_tokens: Whether or not to add special tokens
472 when encoding the prompt.
473
474 Returns:
475 list[int]: token ids.
476 """

Callers 8

__init__Method · 0.90
__init__Method · 0.90
__init__Method · 0.90
__init__Method · 0.90
test_tokenizerFunction · 0.90
test_glm4_special_tokenFunction · 0.90

Calls

no outgoing calls

Tested by 4

test_tokenizerFunction · 0.72
test_glm4_special_tokenFunction · 0.72