Encode text using the text tokenizer and shift the ID values for command tokens.
(self, text, process_fn=None)
| 323 | return self._text_token_vocab |
| 324 | |
| 325 | def EncodeAsIds(self, text, process_fn=None): |
| 326 | """ |
| 327 | encode text using text tokenizer and shift Id values for command tokens |
| 328 | """ |
| 329 | processed_text = text |
| 330 | if process_fn is not None: |
| 331 | processed_text = process_fn(processed_text) |
| 332 | |
def split_on_token(tok_extended: "CommandToken", text):
    """Split ``text`` around every occurrence of one command token.

    Args:
        tok_extended: Object describing the command token. Its ``token``
            attribute is the literal string to split on; its ``lstrip`` /
            ``rstrip`` flags control whether the token swallows the
            whitespace on its left / right side.
        text: The string to split.

    Returns:
        A list of strings in their original order: the non-empty text
        pieces, with the token string inserted wherever it occurred in
        ``text``. Empty pieces are dropped. An empty ``text`` yields an
        empty list (previously it yielded a spurious ``[token]``).
    """
    if not text:
        # "".split(tok) == [""], which used to be mistaken for a leading
        # occurrence of the token and produced a token that was never there.
        return []

    tok = tok_extended.token
    pieces = text.split(tok)
    last = len(pieces) - 1
    result = []
    for i, piece in enumerate(pieces):
        # CommandToken can control whitespace stripping around itself.
        # This gives GPT2- and Roberta-style special tokens different
        # behaviour depending on the token.
        # Cf. https://github.com/huggingface/transformers/pull/2778
        # and https://github.com/huggingface/transformers/issues/3788
        if tok_extended.rstrip and i > 0:
            # Counter-intuitive: ``rstrip`` means the token eats the white
            # space on ITS right, i.e. the left edge of the following piece.
            piece = piece.lstrip()
        if tok_extended.lstrip and i < last:
            # Mirror case: the token eats the white space on its left, i.e.
            # the right edge of the preceding piece.
            piece = piece.rstrip()

        if piece:
            result.append(piece)
        if i < last:
            # Every piece except the final one was followed by the token.
            result.append(tok)
    return result
| 363 | |
| 364 | def split_on_tokens(tok_list, text): |
| 365 | if not text.strip(): |
| 366 | return [] |
| 367 | if not tok_list: |
| 368 | return self.text_tokenizer.encode(text) |
| 369 | |
| 370 | tokenized_text = [] |
| 371 | text_list = [text] |
| 372 | for tok in tok_list: |
| 373 | tokenized_text = [] |
| 374 | for sub_text in text_list: |
| 375 | if sub_text not in self._command_token_tokens: |
| 376 | tokenized_text.extend(split_on_token(tok, sub_text)) |
| 377 | else: |
| 378 | tokenized_text.append(sub_text) |
| 379 | text_list = tokenized_text |
| 380 | |
| 381 | return list( |
| 382 | itertools.chain.from_iterable( |
no test coverage detected