| 47 | self.vocab_size = self.tokenizer.vocab_size |
| 48 | |
def __call__(self, sequence, **kwargs):
    """Tokenize a string or a collection of strings with ``self.tokenizer``.

    Args:
        sequence: A single ``str`` (wrapped into a one-element list) or an
            iterable of strings, forwarded to the tokenizer as a batch.
        **kwargs: Extra options forwarded to ``self.tokenizer``; they
            override the defaults assembled here. The special key
            ``return_mask`` (default ``False``) is consumed by this method
            and never reaches the tokenizer.

    Returns:
        ``input_ids`` of the tokenizer output, or the pair
        ``(input_ids, attention_mask)`` when ``return_mask`` is truthy.
    """
    want_mask = kwargs.pop('return_mask', False)

    # Defaults first, caller-supplied options last so they take precedence.
    # NOTE(review): 'pt' presumably requests PyTorch tensors -- confirm
    # against the wrapped tokenizer.
    options = {'return_tensors': 'pt'}
    if self.seq_len is not None:
        # A fixed sequence length means pad/truncate every item to it.
        options['padding'] = 'max_length'
        options['truncation'] = True
        options['max_length'] = self.seq_len
    options.update(**kwargs)

    # Always hand the tokenizer a batch, optionally cleaning each text.
    texts = [sequence] if isinstance(sequence, str) else sequence
    if self.clean:
        texts = [self._clean(text) for text in texts]
    encoded = self.tokenizer(texts, **options)

    if not want_mask:
        return encoded.input_ids
    return encoded.input_ids, encoded.attention_mask
| 74 | |
| 75 | def _clean(self, text): |
| 76 | if self.clean == 'whitespace': |