(self, raw_data, tokenizer: transformers.PreTrainedTokenizer)
| 181 | """Dataset for supervised fine-tuning.""" |
| 182 | |
def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer):
    """Preprocess the raw examples once and keep the resulting fields.

    Args:
        raw_data: Iterable of examples; each one is indexed with the
            ``"conversations"`` key.
        tokenizer: Tokenizer handed through to ``preprocess`` unchanged.
    """
    super(SupervisedDataset, self).__init__()

    rank0_print("Formatting inputs...")

    # Pull the conversation entry out of every raw example, in order.
    conversations = [record["conversations"] for record in raw_data]

    # ``preprocess`` returns a mapping; only the three entries read below
    # are kept on the instance.
    encoded = preprocess(conversations, tokenizer)

    self.input_ids = encoded["input_ids"]
    self.labels = encoded["labels"]
    self.attention_mask = encoded["attention_mask"]
| 193 | |
def __len__(self):
    """Return the number of entries held in ``self.input_ids``."""
    example_count = len(self.input_ids)
    return example_count
no test coverage detected