MCPcopy Index your code
hub / github.com/deepspeedai/DeepSpeedExamples / BertWordPieceTokenizer

Class BertWordPieceTokenizer

Megatron-LM/data_utils/tokenization.py:687–796  ·  view source on GitHub ↗

Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization in BERT training. Defaults to the `bert-large-uncased` tokenizer when `tokenizer_model_type` is not a known pretrained model.

Source from the content-addressed store, hash-verified

685 return self.sp.DecodeTokens(Tokens)
686
687class BertWordPieceTokenizer(Tokenizer):
688 """
689 Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization
690 in BERT training. Default to bert-large-uncased tokenizer.
691 """
def __init__(self, tokenizer_model_type=None, cache_dir=None, **kwargs):
    """
    Load a pretrained BERT WordPiece tokenizer and build the command-token,
    type-token and vocabulary lookup tables from it.

    Args:
        tokenizer_model_type: key into `PRETRAINED_VOCAB_ARCHIVE_MAP`. Any
            value not present there (including the default `None`) is
            replaced by 'bert-large-uncased'.
        cache_dir: passed through unchanged to `BertTokenizer.from_pretrained`.
        **kwargs: accepted for call-site compatibility; not used here.

    Raises:
        KeyError: if the loaded vocabulary lacks one of '[PAD]', '[CLS]',
            '[MASK]', '[UNK]' or '[SEP]'.
    """
    # default to bert-large-uncased tokenizer
    if tokenizer_model_type not in PRETRAINED_VOCAB_ARCHIVE_MAP:
        tokenizer_model_type = 'bert-large-uncased'

    # Only rank 0 logs. `get_rank()` requires an initialized default process
    # group, so when there is none (plain single-process use) this process
    # is treated as the logging rank instead of raising.
    dist = torch.distributed
    in_process_group = dist.is_available() and dist.is_initialized()
    should_log = (not in_process_group) or dist.get_rank() == 0

    if should_log:
        print('loading BertWordPieceTokenizer (', tokenizer_model_type, ') from cache_dir ', cache_dir)
    # lower-case unless the model name marks it as cased or chinese
    do_lower_case = not ('-cased' in tokenizer_model_type or 'chinese' in tokenizer_model_type)
    self.text_tokenizer = BertTokenizer.from_pretrained(
        tokenizer_model_type, do_lower_case=do_lower_case, cache_dir=cache_dir)
    if should_log:
        print('loaded', tokenizer_model_type)
    # disable max len warnings by increasing max len
    self.text_tokenizer.max_len = int(1e12)

    wordpiece_vocab = self.text_tokenizer.vocab

    # set command tokens from wordpiece tokenizer values
    command_specs = (
        ('pad', '[PAD]'),
        ('ENC', '[CLS]'),
        ('MASK', '[MASK]'),
        ('unk', '[UNK]'),
        ('sep', '[SEP]'),
    )
    self._command_tokens = [
        CommandToken(name, token, wordpiece_vocab[token])
        for name, token in command_specs
    ]
    self.command_name_map = {tok.name: tok for tok in self._command_tokens}
    self.command_token_map = {tok.token: tok for tok in self._command_tokens}
    self.command_id_map = {tok.Id: tok for tok in self._command_tokens}

    # set type tokens
    self.type_tokens = [
        TypeToken('str0', '<str0>', 0),
        TypeToken('str1', '<str1>', 1),
    ]
    self.type_name_map = {tok.name: tok for tok in self.type_tokens}
    self.type_token_map = {tok.token: tok for tok in self.type_tokens}
    self.type_id_map = {tok.Id: tok for tok in self.type_tokens}

    # Counts are derived from the lists above so they cannot drift apart.
    # The command tokens are entries of the wordpiece vocab itself, hence
    # the subtraction to get the number of plain text tokens.
    self.num_command_tokens = len(self._command_tokens)
    self.num_tokens = len(wordpiece_vocab)
    self.num_text_tokens = self.num_tokens - self.num_command_tokens
    self.num_type_tokens = len(self.type_tokens)

    # parse tokens and vocabs from tokenizer
    self._tokens = list(wordpiece_vocab)
    self._vocab = dict(wordpiece_vocab)

    # separate copies, so mutating one table does not affect the other
    self._text_tokens = list(self._tokens)
    self._text_token_vocab = dict(wordpiece_vocab)

    # NOTE(review): `command_id_map` / `type_id_map` map Id -> token object,
    # so the two `*_vocab` dicts below are keyed by the CommandToken /
    # TypeToken objects, not by their token strings. Kept as-is because
    # callers are not visible from here -- confirm this is intended.
    self._command_token_tokens = list(self.command_token_map.keys())
    self._command_token_vocab = {tok: Id for Id, tok in self.command_id_map.items()}

    self._token_types = list(self.type_token_map.keys())
    self._token_type_vocab = {tok: Id for Id, tok in self.type_id_map.items()}
744

Callers 1

make_tokenizerFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected