MCPcopy Index your code
hub / github.com/zai-org/CogView / add_data_args

Function add_data_args

arguments.py:235–273  ·  view source on GitHub ↗

Train/valid/test data arguments.

(parser)

Source from the content-addressed store, hash-verified

233
234
def add_data_args(parser):
    """Train/valid/test data arguments.

    Registers the data-related command-line options on ``parser`` inside
    an argument group titled ``data``.

    Args:
        parser: An ``argparse.ArgumentParser`` (or any object exposing a
            compatible ``add_argument_group`` method).

    Returns:
        The same ``parser`` object that was passed in, so calls can be
        chained.
    """
    group = parser.add_argument_group('data', 'data configurations')

    group.add_argument('--model-parallel-size', type=int, default=1,
                       help='size of the model parallel.')
    group.add_argument('--shuffle', action='store_true',
                       help='Shuffle data. Shuffling is deterministic '
                            'based on seed and current epoch.')
    # nargs='+': when the flag is given, at least one value is required.
    group.add_argument('--train-data', nargs='+', default=None,
                       help='Whitespace separated filenames or corpora names '
                            'for training.')

    # nargs='*': the flag may be given with zero values, yielding [].
    group.add_argument('--valid-data', nargs='*', default=None,
                       help="""Filename for validation data.""")
    # Stored as the raw comma-separated string; it is not parsed here.
    group.add_argument('--split', default='1000,1,1',
                       help='comma-separated list of proportions for training,'
                            ' validation, and test split')
    group.add_argument('--test-data', nargs='*', default=None,
                       help="""Filename for testing""")

    group.add_argument('--num-workers', type=int, default=2,
                       help="""Number of workers to use for dataloading""")

    # argparse rejects any value outside `choices` at parse time.
    group.add_argument('--dataset-type', type=str,
                       default='TokenizedDataset',
                       choices=['TokenizedDataset',
                                'TextCodeDataset',
                                'CompactBinaryDataset'
                                ],
                       help='what type of dataset to use')

    group.add_argument('--max-memory-length', type=int, default=2048,
                       help="max memory buffer for attention")
    group.add_argument('--new-dataset-path', type=str, default=None,
                       help='The folder we will dynamically check for lmdbs during training.')

    return parser
274
275def add_generation_api_args(parser):
276 """generation api arguments"""

Callers 1

get_args — Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected