Train/valid/test data arguments.
(parser)
| 233 | |
| 234 | |
def add_data_args(parser):
    """Register the train/valid/test data options on ``parser``.

    Every option is attached to an argument group titled ``data`` and the
    same ``parser`` object is handed back to the caller.
    """
    # (flag, add_argument keyword arguments), in registration order.
    option_specs = (
        ('--model-parallel-size', dict(
            type=int,
            default=1,
            help='size of the model parallel.',
        )),
        ('--shuffle', dict(
            action='store_true',
            help='Shuffle data. Shuffling is deterministic '
                 'based on seed and current epoch.',
        )),
        ('--train-data', dict(
            nargs='+',
            default=None,
            help='Whitespace separated filenames or corpora names '
                 'for training.',
        )),
        ('--valid-data', dict(
            nargs='*',
            default=None,
            help="""Filename for validation data.""",
        )),
        ('--split', dict(
            default='1000,1,1',
            help='comma-separated list of proportions for training,'
                 ' validation, and test split',
        )),
        ('--test-data', dict(
            nargs='*',
            default=None,
            help="""Filename for testing""",
        )),
        ('--num-workers', dict(
            type=int,
            default=2,
            help="""Number of workers to use for dataloading""",
        )),
        ('--dataset-type', dict(
            type=str,
            default='TokenizedDataset',
            choices=['TokenizedDataset',
                     'TextCodeDataset',
                     'CompactBinaryDataset'],
            help='what type of dataset to use',
        )),
        ('--max-memory-length', dict(
            type=int,
            default=2048,
            help="max memory buffer for attention",
        )),
        ('--new-dataset-path', dict(
            type=str,
            default=None,
            help='The folder we will dynamically check for lmdbs during training.',
        )),
    )

    data_group = parser.add_argument_group('data', 'data configurations')
    for flag, settings in option_specs:
        data_group.add_argument(flag, **settings)

    return parser
| 274 | |
| 275 | def add_generation_api_args(parser): |
| 276 | """generation api arguments""" |