| 797 | |
| 798 | |
| 799 | class GPT2BPETokenizer(Tokenizer): |
def __init__(self, cache_dir=None, **kwargs):
    """Wrap the pretrained 'gpt2' GPT2Tokenizer and build the lookup tables.

    Args:
        cache_dir: passed through to ``GPT2Tokenizer.from_pretrained`` as
            its ``cache_dir`` argument.
        **kwargs: accepted but not read by this constructor.
    """
    self.text_tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                        cache_dir=cache_dir)
    # Raising max_len this far disables the wrapped tokenizer's
    # max-length warnings.
    self.text_tokenizer.max_len = int(1e12)

    bpe_vocab = self.text_tokenizer.encoder
    end_of_text_id = bpe_vocab['<|endoftext|>']

    # Size bookkeeping.
    self.num_command_tokens = 2
    self.num_type_tokens = 2
    self.num_tokens = len(bpe_vocab)
    # One less than the full vocabulary (presumably leaving out
    # '<|endoftext|>' -- confirm against callers).
    self.num_text_tokens = self.num_tokens - 1

    # 'pad' and 'eos' are both given the '<|endoftext|>' string and id, so
    # wherever the token/Id attributes mirror those arguments the later
    # entry ('eos') is the one kept in the token- and id-keyed maps.
    self._command_tokens = [
        CommandToken('pad', '<|endoftext|>', end_of_text_id),
        CommandToken('eos', '<|endoftext|>', end_of_text_id),
    ]
    self.command_name_map = {}
    self.command_token_map = {}
    self.command_id_map = {}
    for cmd in self._command_tokens:
        self.command_name_map[cmd.name] = cmd
        self.command_token_map[cmd.token] = cmd
        self.command_id_map[cmd.Id] = cmd

    self.type_tokens = [
        TypeToken('str0', '<str0>', 0),
        TypeToken('str1', '<str1>', 1),
    ]
    self.type_name_map = {}
    self.type_token_map = {}
    self.type_id_map = {}
    for type_tok in self.type_tokens:
        self.type_name_map[type_tok.name] = type_tok
        self.type_token_map[type_tok.token] = type_tok
        self.type_id_map[type_tok.Id] = type_tok

    # Full vocabulary and text-only vocabulary are separate copies of the
    # same encoder contents.
    self._tokens = list(bpe_vocab)
    self._vocab = dict(bpe_vocab.items())

    self._text_tokens = list(self._tokens)
    self._text_token_vocab = dict(bpe_vocab.items())

    # The *_vocab tables below are keyed by the token objects stored in the
    # id maps and map back to their ids.
    self._command_token_tokens = list(self.command_token_map)
    self._command_token_vocab = {
        cmd: cmd_id for cmd_id, cmd in self.command_id_map.items()
    }

    self._token_types = list(self.type_token_map)
    self._token_type_vocab = {
        type_tok: type_id for type_id, type_tok in self.type_id_map.items()
    }
| 838 | |
def EncodeAsIds(self, text, process_fn=None):
    """Encode ``text`` into ids with the wrapped text tokenizer.

    Args:
        text: the input to encode.
        process_fn: optional callable applied to ``text`` before encoding;
            skipped when ``None``.

    Returns:
        A ``Tokenization`` built from the ids, the processed text and the
        original text, with this tokenizer's command tokens set on it.
    """
    prepared = text if process_fn is None else process_fn(text)
    token_ids = self.text_tokenizer.encode(prepared)
    result = Tokenization(token_ids, prepared, text)
    result.set_command_tokens(self._command_tokens)
    return result
| 848 | |
| 849 | |
| 850 | def EncodeAsTokens(self, text, process_fn=None): |
| 851 | processed_text = text |
| 852 | if process_fn is not None: |
| 853 | processed_text = process_fn(processed_text) |
| 854 | tokens = [] |
| 855 | for token in re.findall(self.text_tokenizer.pat, processed_text): |
| 856 | token = ''.join(self.text_tokenizer.bye_encoder[b] for b in token.encode('utf-8')) |