Tokenizer object that handles text tokenization, command tokens, and type tokens. Command tokens and text tokens are stored together in one mapping of size `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as the first `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. Token types are stored in a separate mapping of size `len(type_tokens)`.
# Rebind DEFAULT_TYPE_TOKENS to whatever prep_type_tokens returns for the
# current value. NOTE(review): prep_type_tokens is defined outside this view;
# presumably it converts raw type-token specs into token objects -- confirm.
DEFAULT_TYPE_TOKENS = prep_type_tokens(DEFAULT_TYPE_TOKENS)
| 191 | |
| 192 | class Tokenizer(object): |
| 193 | """ |
| 194 | Tokenizer object that handles text tokenization, command tokens, and type tokens. |
| 195 | |
| 196 | Command tokens and text tokens are stored together in one mapping of size |
| 197 | `len(text_tokenizer)+len(command_tokens)`. Command tokens are stored as first |
| 198 | `len(command_tokens)` tokens. Token idx is stored at `idx+len(command_tokens)`. |
| 199 | |
| 200 | Token types are stored in a separate mapping of size `len(type_tokens)`. |
| 201 | """ |
| 202 | def __init__(self, text_tokenizer, command_tokens=None, type_tokens=None): |
| 203 | # set text tokenizer |
| 204 | self.text_tokenizer = text_tokenizer |
| 205 | if not hasattr(self, 'num_text_tokens'): |
| 206 | self.num_text_tokens = len(self.text_tokenizer) |
| 207 | |
| 208 | # set command tokens |
| 209 | if command_tokens is None: |
| 210 | command_tokens = DEFAULT_COMMAND_TOKENS |
| 211 | self._command_tokens = command_tokens |
| 212 | self.command_name_map = {tok.name: tok for tok in self._command_tokens} |
| 213 | self.command_token_map = {tok.token: tok for tok in self._command_tokens} |
| 214 | self.command_id_map = {tok.Id: tok for tok in self._command_tokens} |
| 215 | if not hasattr(self, 'num_command_tokens'): |
| 216 | self.num_command_tokens = len(self._command_tokens) |
| 217 | if not hasattr(self, 'num_tokens'): |
| 218 | self.num_tokens = self.num_command_tokens + self.num_text_tokens |
| 219 | |
| 220 | # set type tokens |
| 221 | if type_tokens is None: |
| 222 | type_tokens = DEFAULT_TYPE_TOKENS |
| 223 | self.type_tokens = type_tokens |
| 224 | self.type_name_map = {tok.name: tok for tok in self.type_tokens} |
| 225 | self.type_token_map = {tok.token: tok for tok in self.type_tokens} |
| 226 | self.type_id_map = {tok.Id: tok for tok in self.type_tokens} |
| 227 | if not hasattr(self, 'num_type_tokens'): |
| 228 | self.num_type_tokens = len(self.type_tokens) |
| 229 | |
| 230 | # parse tokens and vocabs from tokenizer |
| 231 | self._tokens = list(self.command_token_map.keys()) + list(self.text_tokenizer.tokens) |
| 232 | self._vocab = {t:Id for Id,t in self.command_id_map.items()} |
| 233 | self._vocab.update({t:Id+self.num_command_tokens for t,Id in self.text_tokenizer.vocab.items()}) |
| 234 | |
| 235 | self._text_tokens = list(self.text_tokenizer.tokens) |
| 236 | self._text_token_vocab = {t:Id+self.num_command_tokens for t,Id in self.text_tokenizer.vocab.items()} |
| 237 | |
| 238 | self._command_token_tokens = list(self.command_token_map.keys()) |
| 239 | self._command_token_vocab = {t:Id for Id,t in self.command_id_map.items()} |
| 240 | |
| 241 | self._token_types = list(self.type_token_map.keys()) |
| 242 | self._token_type_vocab = {t:Id for Id, t in self.type_id_map.items()} |
| 243 | |
| 244 | |
| 245 | def __call__(self, text, process_fn=None): |
| 246 | """run preprocessing and encode text as Ids""" |
| 247 | return self.EncodeAsIds(text, process_fn=process_fn) |
| 248 | |
| 249 | def __len__(self): |