MCPcopy Index your code
hub / github.com/deepspeedai/DeepSpeedExamples / GPT2BPETokenizer

Class GPT2BPETokenizer

Megatron-LM/data_utils/tokenization.py:799–889  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

797
798
799class GPT2BPETokenizer(Tokenizer):
def __init__(self, cache_dir=None, **kwargs):
    """Wrap the pretrained 'gpt2' BPE tokenizer and build the lookup tables.

    Args:
        cache_dir: forwarded to ``GPT2Tokenizer.from_pretrained`` as
            ``cache_dir``.
        **kwargs: accepted but not used by this constructor.
    """
    self.text_tokenizer = GPT2Tokenizer.from_pretrained(
        'gpt2', cache_dir=cache_dir)

    # Raise max_len far beyond any realistic input so the wrapped
    # tokenizer does not emit max-length warnings.
    self.text_tokenizer.max_len = 10 ** 12

    bpe_encoder = self.text_tokenizer.encoder

    self.num_command_tokens = 2
    self.num_tokens = len(bpe_encoder)
    # One less than the full vocabulary -- presumably leaves out
    # '<|endoftext|>'; TODO confirm.
    self.num_text_tokens = self.num_tokens - 1
    self.num_type_tokens = 2

    # 'pad' and 'eos' are both backed by the same '<|endoftext|>' entry.
    # NOTE(review): assuming CommandToken exposes its 2nd/3rd arguments as
    # .token/.Id, the token- and id-keyed maps below end up with a single
    # entry (the later 'eos' one) -- confirm this is intended.
    end_of_text = '<|endoftext|>'
    self._command_tokens = [
        CommandToken('pad', end_of_text, bpe_encoder[end_of_text]),
        CommandToken('eos', end_of_text, bpe_encoder[end_of_text]),
    ]
    self.command_name_map = {}
    self.command_token_map = {}
    self.command_id_map = {}
    for command in self._command_tokens:
        self.command_name_map[command.name] = command
        self.command_token_map[command.token] = command
        self.command_id_map[command.Id] = command

    self.type_tokens = [
        TypeToken('str0', '<str0>', 0),
        TypeToken('str1', '<str1>', 1),
    ]
    self.type_name_map = {}
    self.type_token_map = {}
    self.type_id_map = {}
    for type_tok in self.type_tokens:
        self.type_name_map[type_tok.name] = type_tok
        self.type_token_map[type_tok.token] = type_tok
        self.type_id_map[type_tok.Id] = type_tok

    # Full vocabulary and text-only vocabulary are independent copies of
    # the same underlying BPE encoder table.
    self._tokens = list(bpe_encoder.keys())
    self._vocab = dict(bpe_encoder.items())

    self._text_tokens = list(self._tokens)
    self._text_token_vocab = dict(bpe_encoder.items())

    # NOTE(review): the two "*_vocab" dicts below are keyed by the token
    # *objects* stored in the id maps, not by token strings -- verify
    # against callers before relying on string lookups.
    self._command_token_tokens = list(self.command_token_map.keys())
    self._command_token_vocab = {
        command: command_id
        for command_id, command in self.command_id_map.items()
    }

    self._token_types = list(self.type_token_map.keys())
    self._token_type_vocab = {
        type_tok: type_id
        for type_id, type_tok in self.type_id_map.items()
    }
838
def EncodeAsIds(self, text, process_fn=None):
    """Encode ``text`` into BPE ids wrapped in a ``Tokenization``.

    Args:
        text: the raw input text.
        process_fn: optional callable applied to ``text`` before encoding;
            skipped when ``None``.

    Returns:
        A ``Tokenization`` built from (ids, processed text, original text)
        with this tokenizer's command tokens attached.
    """
    prepared = text if process_fn is None else process_fn(text)
    token_ids = self.text_tokenizer.encode(prepared)

    result = Tokenization(token_ids, prepared, text)
    result.set_command_tokens(self._command_tokens)
    return result
848
849
850 def EncodeAsTokens(self, text, process_fn=None):
851 processed_text = text
852 if process_fn is not None:
853 processed_text = process_fn(processed_text)
854 tokens = []
855 for token in re.findall(self.text_tokenizer.pat, processed_text):
856 token = ''.join(self.text_tokenizer.bye_encoder[b] for b in token.encode('utf-8'))

Callers 1

make_tokenizer · Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected