Tokenize a list of strings.
(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer)
| 113 | |
| 114 | |
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings.

    Each string is passed to ``tokenizer`` on its own, with truncation
    enabled and ``max_length`` set to ``tokenizer.model_max_length``.

    Args:
        strings: Texts to encode; the tokenizer is called once per element.
        tokenizer: Callable tokenizer exposing ``model_max_length``. Each
            call must return an object with an ``input_ids`` attribute.

    Returns:
        A dict of four parallel lists, one entry per input string:
        ``input_ids`` and ``labels`` hold NumPy arrays of token ids with
        equal contents, and ``input_ids_lens`` and ``labels_lens`` hold the
        corresponding token counts. The ``labels*`` entries are independent
        copies of the ``input_ids*`` entries, not aliases of them.
    """
    max_length = tokenizer.model_max_length
    token_id_lists = [
        tokenizer(text, max_length=max_length, truncation=True).input_ids
        for text in strings
    ]

    input_ids = [np.array(token_ids) for token_ids in token_id_lists]
    # Copy instead of aliasing: callers commonly overwrite parts of `labels`
    # in place, and that must not silently rewrite `input_ids` as well.
    labels = [ids.copy() for ids in input_ids]

    input_ids_lens = [len(token_ids) for token_ids in token_id_lists]
    labels_lens = list(input_ids_lens)

    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )
| 137 | |
| 138 | |
| 139 | def preprocess( |