Arguments pertaining to what data we are going to input our model for training and eval.
| 198 | |
@dataclass
class DataArguments:
    """
    Options describing the data that is fed to the model for training and evaluation.

    Every attribute is declared through `dataclasses.field`; the `help` entry of its
    metadata carries the description of the option.
    """

    # Chat template to apply; left unset by default.
    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "The chat template to use."},
    )

    # Maps each dataset to the proportion of it that should be used.
    dataset_mixer: Optional[Dict[str, float]] = field(
        default=None, metadata={"help": "Datasets and their proportions to be used for training ift/rl."}
    )

    # Name of the text column (per the help text, only relevant for continued pretraining).
    text_column: Optional[str] = field(
        default="text",
        metadata={
            "help": "The column name to use for the text in the dataset (only used for continued pretraining)."
        },
    )

    # A factory is used so that every instance receives its own list object.
    dataset_splits: Optional[List[str]] = field(
        default_factory=lambda: ["train", "test"],
        metadata={"help": "List of train test splits to use in the dataset"},
    )

    # Optional config names; the help text requires one entry per `dataset_mixer` key.
    dataset_configs: Optional[List[str]] = field(
        default=None,
        metadata={
            "help": "List of dataset config names. If given must be the same length as 'dataset_mixer' keys."
        },
    )

    # Number of processes used for preprocessing; unset by default.
    preprocessing_num_workers: Optional[int] = field(
        default=None, metadata={"help": "The number of processes to use for the preprocessing."}
    )

    # Tokenizer truncation side; unset by default.
    truncation_side: Optional[str] = field(
        default=None,
        metadata={"help": "Truncation side to use for the tokenizer."},
    )

    # Enabled by default: controls insertion of an empty leading system message.
    auto_insert_empty_system_msg: bool = field(
        default=True,
        metadata={
            "help": "Whether to automatically insert an empty system message as the first message if `system` is mentioned in the chat template."
        },
    )
| 237 | |
| 238 | |
| 239 | @dataclass |
no outgoing calls