hub / github.com/deepspeedai/DeepSpeedExamples / Tokenizer

Class Tokenizer

Megatron-LM/data_utils/tokenization.py:192–387 · view source on GitHub ↗

Tokenizer object that handles text tokenization, command tokens, and type tokens. Command tokens and text tokens are stored together in one mapping of size `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as the first `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. Token types are stored in a separate mapping of size `len(type_tokens)`.

Source from the content-addressed store, hash-verified

190	DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS)
191
192	class Tokenizer(object):
193	"""
194	Tokenizer object that handles text tokenization, command tokens, and type tokens.
195
196	Command tokens and text tokens are stored together in one mapping of size
197	`len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first
198	`len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`.
199
200	Token types are stored in a separate mapping of size `len(type_tokens)`.
201	"""
202	def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None):
203	# set text tokenizer
204	self.text_tokenizer = text_tokenizer
205	if not hasattr(self, 'num_text_tokens'):
206	self.num_text_tokens = len(self.text_tokenizer)
207
208	# set command tokens
209	if command_tokens is None:
210	command_tokens = DEFAULT_COMMAND_TOKENS
211	self._command_tokens = command_tokens
212	self.command_name_map = {tok.name: tok for tok in self._command_tokens}
213	self.command_token_map = {tok.token: tok for tok in self._command_tokens}
214	self.command_id_map = {tok.Id: tok for tok in self._command_tokens}
215	if not hasattr(self, 'num_command_tokens'):
216	self.num_command_tokens = len(self._command_tokens)
217	if not hasattr(self, 'num_tokens'):
218	self.num_tokens = self.num_command_tokens + self.num_text_tokens
219
220	# set type tokens
221	if type_tokens is None:
222	type_tokens = DEFAULT_TYPE_TOKENS
223	self.type_tokens = type_tokens
224	self.type_name_map = {tok.name: tok for tok in self.type_tokens}
225	self.type_token_map = {tok.token: tok for tok in self.type_tokens}
226	self.type_id_map = {tok.Id: tok for tok in self.type_tokens}
227	if not hasattr(self, 'num_type_tokens'):
228	self.num_type_tokens = len(self.type_tokens)
229
230	# parse tokens and vocabs from tokenizer
231	self._tokens = list(self.command_token_map.keys()) + list(self.text_tokenizer.tokens)
232	self._vocab = {t:Id for Id,t in self.command_id_map.items()}
233	self._vocab.update({t:Id+self.num_command_tokens for t,Id in self.text_tokenizer.vocab.items()})
234
235	self._text_tokens = list(self.text_tokenizer.tokens)
236	self._text_token_vocab = {t:Id+self.num_command_tokens for t,Id in self.text_tokenizer.vocab.items()}
237
238	self._command_token_tokens = list(self.command_token_map.keys())
239	self._command_token_vocab = {t:Id for Id,t in self.command_id_map.items()}
240
241	self._token_types = list(self.type_token_map.keys())
242	self._token_type_vocab = {t:Id for Id, t in self.type_id_map.items()}
243
244
245	def __call__(self, text, process_fn=None):
246	"""run preprocessing and encode text as Ids"""
247	return self.EncodeAsIds(text, process_fn=process_fn)
248
249	def __len__(self):

Callers 1

make_tokenizerFunction · 0.70

Calls

no outgoing calls

Tested by

no test coverage detected