Model arguments
(parser)
| 23 | |
| 24 | |
def add_model_config_args(parser):
    """Register the model-configuration command-line options on *parser*.

    All options are added to a dedicated argument group titled ``model`` so
    they appear together in ``--help`` output.

    Args:
        parser: An ``argparse.ArgumentParser`` (or any object exposing a
            compatible ``add_argument_group`` method).

    Returns:
        The same *parser* object, to allow chaining of ``add_*_args`` calls.
    """

    group = parser.add_argument_group('model', 'model configuration')

    # Transformer architecture and regularization hyper-parameters.
    group.add_argument('--attention-dropout', type=float, default=0.1,
                       help='dropout probability for attention weights')
    group.add_argument('--num-attention-heads', type=int, default=16,
                       help='num of transformer attention heads')
    group.add_argument('--hidden-size', type=int, default=1024,
                       help='transformer hidden size')
    group.add_argument('--num-layers', type=int, default=24,
                       help='num decoder layers')
    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                       help='layer norm epsilon')
    group.add_argument('--hidden-dropout', type=float, default=0.1,
                       help='dropout probability for hidden state transformer')
    group.add_argument('--max-position-embeddings', type=int, default=512,
                       help='maximum number of position embeddings to use')

    # Vocabulary options.
    group.add_argument('--vocab-size', type=int, default=30522,
                       help='vocab size to use for non-character-level '
                            'tokenization. This value will only be used when '
                            'creating a tokenizer')
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # fragment except the last must end with a space.
    group.add_argument('--deep-init', action='store_true',
                       help='initialize bert model similar to gpt2 model. '
                            'scales initialization of projection layers by a '
                            'factor of 1/sqrt(2N). Necessary to train bert '
                            'models larger than BERT-Large.')
    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
                       help='Pad the vocab size to be divisible by this value. '
                            'This is added for computational efficiency reasons.')

    # Optimizer placement options.
    group.add_argument('--cpu-optimizer', action='store_true',
                       help='Run optimizer on CPU')
    # The underscore spelling of this flag is kept as-is so existing command
    # lines and launch scripts keep working.
    group.add_argument('--cpu_torch_adam', action='store_true',
                       help='Use Torch Adam as optimizer on CPU.')

    # NOTE(review): the default of -1 presumably means "not set"; confirm
    # against the code that consumes this option.
    group.add_argument('--max-position-embeddings-finetune', type=int, default=-1,
                       help='maximum number of position embeddings to use in finetune')

    return parser
| 65 | |
| 66 | |
| 67 | def add_fp16_config_args(parser): |