Training arguments.
(parser)
| 94 | |
| 95 | |
| 96 | def add_training_args(parser): |
| 97 | """Training arguments.""" |
| 98 | |
| 99 | group = parser.add_argument_group('train', 'training configurations') |
| 100 | |
| 101 | group.add_argument('--experiment-name', type=str, default="CogView", |
| 102 | help="The experiment name for summary and checkpoint") |
| 103 | group.add_argument('--batch-size', type=int, default=4, |
| 104 | help='Data Loader batch size') |
| 105 | group.add_argument('--weight-decay', type=float, default=0.01, |
| 106 | help='weight decay coefficient for L2 regularization') |
| 107 | group.add_argument('--checkpoint-activations', action='store_true', |
| 108 | help='checkpoint activation to allow for training ' |
| 109 | 'with larger models and sequences') |
| 110 | group.add_argument('--checkpoint-num-layers', type=int, default=1, |
| 111 | help='chunk size (number of layers) for checkpointing') |
| 112 | group.add_argument('--deepspeed-activation-checkpointing', action='store_true', |
| 113 | help='uses activation checkpointing from deepspeed') |
| 114 | group.add_argument('--clip-grad', type=float, default=1.0, |
| 115 | help='gradient clipping') |
| 116 | group.add_argument('--train-iters', type=int, default=1000000, |
| 117 | help='total number of iterations to train over all training runs') |
| 118 | group.add_argument('--log-interval', type=int, default=50, |
| 119 | help='report interval') |
| 120 | group.add_argument('--exit-interval', type=int, default=None, |
| 121 | help='Exit the program after this many new iterations.') |
| 122 | group.add_argument('--summary-dir', type=str, default="", help="The directory to store the summary") |
| 123 | group.add_argument('--seed', type=int, default=1234, |
| 124 | help='random seed') |
| 125 | group.add_argument('--img-tokenizer-path', type=str, default=None, |
| 126 | help='The checkpoint file path of image tokenizer.') |
| 127 | group.add_argument('--img-tokenizer-num-tokens', type=int, default=None, |
| 128 | help='The num tokens of image tokenizer. ONLY use for pretraining with img-tokenizer UNKNOW.') |
| 129 | # Batch producer arguments |
| 130 | group.add_argument('--reset-position-ids', action='store_true', |
| 131 | help='Reset posistion ids after end-of-document token.') |
| 132 | group.add_argument('--reset-attention-mask', action='store_true', |
| 133 | help='Reset self attention maske after ' |
| 134 | 'end-of-document token.') |
| 135 | |
| 136 | # Learning rate. |
| 137 | group.add_argument('--lr-decay-iters', type=int, default=None, |
| 138 | help='number of iterations to decay LR over,' |
| 139 | ' If None defaults to `--train-iters`*`--epochs`') |
| 140 | group.add_argument('--lr-decay-style', type=str, default='linear', |
| 141 | choices=['constant', 'linear', 'cosine', 'exponential'], |
| 142 | help='learning rate decay function') |
| 143 | group.add_argument('--lr-decay-ratio', type=float, default=0.1) |
| 144 | group.add_argument('--lr', type=float, default=1.0e-4, |
| 145 | help='initial learning rate') |
| 146 | group.add_argument('--warmup', type=float, default=0.01, |
| 147 | help='percentage of data to warmup on (.01 = 1% of all ' |
| 148 | 'training iters). Default 0.01') |
| 149 | # model checkpointing |
| 150 | group.add_argument('--save', type=str, default=None, |
| 151 | help='Output directory to save checkpoints to.') |
| 152 | group.add_argument('--save-interval', type=int, default=5000, |
| 153 | help='number of iterations between saves') |