Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization in BERT training. Defaults to the bert-large-uncased tokenizer when `tokenizer_model_type` is not a known pretrained model name.
| 685 | return self.sp.DecodeTokens(Tokens) |
| 686 | |
| 687 | class BertWordPieceTokenizer(Tokenizer): |
| 688 | """ |
| 689 | Loads a pretrained WordPiece tokenizer from `cache_dir` for tokenization |
| 690 | in BERT training. Default to bert-large-uncased tokenizer. |
| 691 | """ |
def __init__(self, tokenizer_model_type=None, cache_dir=None, **kwargs):
    """
    Load a pretrained BERT WordPiece tokenizer and build the token tables.

    Args:
        tokenizer_model_type: key of `PRETRAINED_VOCAB_ARCHIVE_MAP`. Any
            other value (including None) falls back to
            'bert-large-uncased'.
        cache_dir: forwarded to `BertTokenizer.from_pretrained`.
        **kwargs: accepted for signature compatibility; not used here.
    """
    # default to bert-large-uncased tokenizer
    if tokenizer_model_type not in PRETRAINED_VOCAB_ARCHIVE_MAP:
        tokenizer_model_type = 'bert-large-uncased'

    # Only rank 0 reports progress. get_rank() raises when no process group
    # has been initialised, so a non-distributed run is treated like rank 0
    # instead of failing inside the constructor.
    dist = torch.distributed
    is_distributed = dist.is_available() and dist.is_initialized()
    should_log = (not is_distributed) or dist.get_rank() == 0

    if should_log:
        print('loading BertWordPieceTokenizer (', tokenizer_model_type, ') from cache_dir ', cache_dir)
    # lower-case the input unless the model is a cased or a chinese one
    do_lower_case = not ('-cased' in tokenizer_model_type or 'chinese' in tokenizer_model_type)
    self.text_tokenizer = BertTokenizer.from_pretrained(
        tokenizer_model_type, do_lower_case=do_lower_case, cache_dir=cache_dir)
    if should_log:
        print('loaded', tokenizer_model_type)
    # disable max len warnings by increasing max len
    self.text_tokenizer.max_len = int(1e12)

    vocab = self.text_tokenizer.vocab

    # set command tokens from wordpiece tokenizer values
    self._command_tokens = [
        CommandToken('pad', '[PAD]', vocab['[PAD]']),
        CommandToken('ENC', '[CLS]', vocab['[CLS]']),
        CommandToken('MASK', '[MASK]', vocab['[MASK]']),
        CommandToken('unk', '[UNK]', vocab['[UNK]']),
        CommandToken('sep', '[SEP]', vocab['[SEP]']),
    ]
    self.command_name_map = {tok.name: tok for tok in self._command_tokens}
    self.command_token_map = {tok.token: tok for tok in self._command_tokens}
    self.command_id_map = {tok.Id: tok for tok in self._command_tokens}

    # The command tokens live inside the wordpiece vocab, so the text-token
    # count is the vocab size minus the number of command tokens. Both
    # counts are derived from the list above rather than a repeated literal.
    self.num_command_tokens = len(self._command_tokens)
    self.num_tokens = len(vocab)
    self.num_text_tokens = self.num_tokens - self.num_command_tokens

    # set type tokens
    self.type_tokens = [
        TypeToken('str0', '<str0>', 0),
        TypeToken('str1', '<str1>', 1),
    ]
    self.num_type_tokens = len(self.type_tokens)
    self.type_name_map = {tok.name: tok for tok in self.type_tokens}
    self.type_token_map = {tok.token: tok for tok in self.type_tokens}
    self.type_id_map = {tok.Id: tok for tok in self.type_tokens}

    # parse tokens and vocabs from tokenizer
    self._tokens = list(vocab.keys())
    self._vocab = dict(vocab.items())

    # the text-token views are independent copies of the full vocab
    self._text_tokens = list(self._tokens)
    self._text_token_vocab = dict(vocab.items())

    self._command_token_tokens = list(self.command_token_map.keys())
    # NOTE(review): the keys here are the CommandToken objects held in
    # command_id_map, not their token strings; kept as-is - confirm what
    # callers of this table expect.
    self._command_token_vocab = {t: Id for Id, t in self.command_id_map.items()}

    self._token_types = list(self.type_token_map.keys())
    # NOTE(review): likewise keyed by the TypeToken objects - confirm.
    self._token_type_vocab = {t: Id for Id, t in self.type_id_map.items()}
| 744 |