(self)
| 161 | self.output_column_name = output_column_name |
| 162 | |
def __iter__(self):
    """Yield fixed-length token blocks packed from the dataset's samples.

    Samples are drawn from ``self.dataset``, formatted by
    ``prepare_sample_text`` using ``self.input_column_name`` and
    ``self.output_column_name``, and gathered into a buffer until the summed
    ``len()`` of the prepared samples reaches ``self.max_buffer_size``. The
    buffer is tokenized in a single call, ``self.concat_token_id`` is
    appended after every sample, and the resulting token stream is cut into
    chunks of ``self.seq_length`` tokens. A trailing chunk shorter than
    ``self.seq_length`` is discarded.

    When ``self.infinite`` is true the dataset is restarted each time it is
    exhausted; otherwise iteration ends after the last buffer. A restart
    that produces no sample at all (empty dataset) ends iteration instead of
    spinning forever.

    Side effects:
        Increments ``self.current_size`` once per yielded chunk.

    Yields:
        dict: ``{"input_ids": torch.LongTensor, "labels": torch.LongTensor}``,
        both built from the same ``self.seq_length`` token ids.
    """
    iterator = iter(self.dataset)
    # True while nothing has been drawn from the current iterator yet; lets
    # us detect an empty dataset instead of restarting it endlessly.
    iterator_is_fresh = True
    more_examples = True
    while more_examples:
        buffer, buffer_len = [], 0
        while buffer_len < self.max_buffer_size:
            try:
                sample = next(iterator)
            except StopIteration:
                if self.infinite and not iterator_is_fresh:
                    iterator = iter(self.dataset)
                    iterator_is_fresh = True
                    continue
                more_examples = False
                break
            iterator_is_fresh = False
            buffer.append(prepare_sample_text(sample, self.input_column_name, self.output_column_name))
            buffer_len += len(buffer[-1])

        if not buffer:
            # Nothing was collected (data ran out exactly at a buffer
            # boundary, or the dataset is empty): there is nothing to
            # tokenize, and no later pass could collect anything either.
            break

        tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
        all_token_ids = []
        for tokenized_input in tokenized_inputs:
            # The separator token marks the boundary between packed samples.
            all_token_ids.extend(tokenized_input + [self.concat_token_id])

        for i in range(0, len(all_token_ids), self.seq_length):
            input_ids = all_token_ids[i : i + self.seq_length]
            # Only full-length chunks are emitted; the short remainder of
            # each buffer is dropped.
            if len(input_ids) == self.seq_length:
                self.current_size += 1
                yield {
                    "input_ids": torch.LongTensor(input_ids),
                    "labels": torch.LongTensor(input_ids),
                }
| 192 | |
| 193 | |
| 194 | def create_datasets(tokenizer, args): |
nothing calls this directly
no test coverage detected