MCPcopy
hub / github.com/deepspeedai/DeepSpeedExamples / Tokenizer

Class Tokenizer

Megatron-LM/data_utils/tokenization.py:192–387  ·  view source on GitHub ↗

Tokenizer object that handles text tokenization, command tokens, and type tokens. Command tokens and text tokens are stored together in one mapping of size `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as the first `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`.

Source from the content-addressed store, hash-verified

190DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS)
191
192class Tokenizer(object):
193 """
194 Tokenizer object that handles text tokenization, command tokens, and type tokens.
195
196 Command tokens and text tokens are stored together in one mapping of size
197 `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first
198 `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`.
199
200 Token types are stored in a separate mapping of size `len(type_tokens)`.
201 """
202 def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None):
203 # set text tokenizer
204 self.text_tokenizer = text_tokenizer
205 if not hasattr(self, 'num_text_tokens'):
206 self.num_text_tokens = len(self.text_tokenizer)
207
208 # set command tokens
209 if command_tokens is None:
210 command_tokens = DEFAULT_COMMAND_TOKENS
211 self._command_tokens = command_tokens
212 self.command_name_map = {tok.name: tok for tok in self._command_tokens}
213 self.command_token_map = {tok.token: tok for tok in self._command_tokens}
214 self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
215 if not hasattr(self, 'num_command_tokens'):
216 self.num_command_tokens = len(self._command_tokens)
217 if not hasattr(self, 'num_tokens'):
218 self.num_tokens = self.num_command_tokens + self.num_text_tokens
219
220 # set type tokens
221 if type_tokens is None:
222 type_tokens = DEFAULT_TYPE_TOKENS
223 self.type_tokens = type_tokens
224 self.type_name_map = {tok.name: tok for tok in self.type_tokens}
225 self.type_token_map = {tok.token: tok for tok in self.type_tokens}
226 self.type_id_map = {tok.Id: tok for tok in self.type_tokens}
227 if not hasattr(self, 'num_type_tokens'):
228 self.num_type_tokens = len(self.type_tokens)
229
230 # parse tokens and vocabs from tokenizer
231 self._tokens = list(self.command_token_map.keys()) + list(self.text_tokenizer.tokens)
232 self._vocab = {t:Id for Id,t in self.command_id_map.items()}
233 self._vocab.update({t:Id+self.num_command_tokens for t,Id in self.text_tokenizer.vocab.items()})
234
235 self._text_tokens = list(self.text_tokenizer.tokens)
236 self._text_token_vocab = {t:Id+self.num_command_tokens for t,Id in self.text_tokenizer.vocab.items()}
237
238 self._command_token_tokens = list(self.command_token_map.keys())
239 self._command_token_vocab = {t:Id for Id,t in self.command_id_map.items()}
240
241 self._token_types = list(self.type_token_map.keys())
242 self._token_type_vocab = {t:Id for Id, t in self.type_id_map.items()}
243
244
245 def __call__(self, text, process_fn=None):
246 """run preprocessing and encode text as Ids"""
247 return self.EncodeAsIds(text, process_fn=process_fn)
248
249 def __len__(self):

Callers 1

make_tokenizerFunction · 0.70

Calls

no outgoing calls

Tested by

no test coverage detected