(
self, raw_data, data_args, tokenizer: transformers.PreTrainedTokenizer
)
| 317 | """Dataset for supervised fine-tuning.""" |
| 318 | |
def __init__(
    self,
    raw_data,
    data_args,
    tokenizer: transformers.PreTrainedTokenizer,
):
    """Preprocess every example once and keep the results on the instance.

    Args:
        raw_data: Iterable of examples; each one is indexed with the
            ``"conversations"`` key.
        data_args: Forwarded unchanged to ``preprocess``.
        tokenizer: Tokenizer forwarded to ``preprocess``.
    """
    super(SupervisedDataset, self).__init__()

    rank0_print("Formatting inputs...")
    # Pull out only the conversation payload of each example.
    conversations = [record["conversations"] for record in raw_data]
    encoded = preprocess(conversations, tokenizer, data_args)

    # Expose the three preprocessed fields as attributes.
    self.input_ids = encoded["input_ids"]
    self.labels = encoded["labels"]
    self.attention_mask = encoded["attention_mask"]
| 331 | |
def __len__(self):
    """Return the number of entries held in ``self.input_ids``."""
    example_count = len(self.input_ids)
    return example_count
no test coverage detected