Train/valid/test data arguments.
(parser)
| 263 | |
| 264 | |
| 265 | def add_data_args(parser): |
| 266 | """Train/valid/test data arguments.""" |
| 267 | |
| 268 | group = parser.add_argument_group('data', 'data configurations') |
| 269 | |
| 270 | group.add_argument('--model-parallel-size', type=int, default=1, |
| 271 | help='size of the model parallel.') |
| 272 | group.add_argument('--shuffle', action='store_true', |
| 273 | help='Shuffle data. Shuffling is deterministic ' |
| 274 | 'based on seed and current epoch.') |
| 275 | group.add_argument('--filter-english', action='store_true') |
| 276 | group.add_argument('--train-data', nargs='+', default=None, |
| 277 | help='Whitespace separated filenames or corpora names ' |
| 278 | 'for training.') |
| 279 | group.add_argument('--valid-data', nargs='*', default=None, |
| 280 | help="""Filename for validation data.""") |
| 281 | group.add_argument('--test-data', nargs='*', default=None, |
| 282 | help="""Filename for testing""") |
| 283 | group.add_argument('--data-dir', type=str, default=None, help="The data path to all the data files") |
| 284 | group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt', |
| 285 | help='the filename containing all the shards sizes') |
| 286 | |
| 287 | group.add_argument('--delim', default=',', |
| 288 | help='delimiter used to parse csv data files') |
| 289 | group.add_argument('--text-key', default='sentence', |
| 290 | help='key to use to extract text from json/csv') |
| 291 | group.add_argument('--eval-text-key', default=None, |
| 292 | help='key to use to extract text from ' |
| 293 | 'json/csv evaluation datasets') |
| 294 | group.add_argument('--split', default='1000,1,1', |
| 295 | help='comma-separated list of proportions for training,' |
| 296 | ' validation, and test split') |
| 297 | |
| 298 | group.add_argument('--no-lazy-loader', action='store_true', |
| 299 | help='whether to lazy read the data set') |
| 300 | group.add_argument('--half-lazy-loader', action='store_true') |
| 301 | group.add_argument('--loader-scatter', type=int, default=None, help='Number of scatters to use for dataloaders') |
| 302 | group.add_argument('--loose-json', action='store_true', |
| 303 | help='Use loose json (one json-formatted string per ' |
| 304 | 'newline), instead of tight json (data file is one ' |
| 305 | 'json string)') |
| 306 | group.add_argument('--presplit-sentences', action='store_true', |
| 307 | help='Dataset content consists of documents where ' |
| 308 | 'each document consists of newline separated sentences') |
| 309 | group.add_argument('--num-workers', type=int, default=2, |
| 310 | help="""Number of workers to use for dataloading""") |
| 311 | group.add_argument('--tokenizer-model-type', type=str, |
| 312 | default=None, |
| 313 | help="Model type to use for sentencepiece tokenization \ |
| 314 | (one of ['bpe', 'char', 'unigram', 'word']) or \ |
| 315 | bert vocab to use for BertWordPieceTokenizer (one of \ |
| 316 | ['bert-large-uncased', 'bert-large-cased', etc.])") |
| 317 | group.add_argument('--tokenizer-path', type=str, default='tokenizer.model', |
| 318 | help='path used to save/load sentencepiece tokenization ' |
| 319 | 'models') |
| 320 | group.add_argument('--tokenizer-type', type=str, |
| 321 | default='BertWordPieceTokenizer', |
| 322 | choices=['CharacterLevelTokenizer', |