(args)
| 272 | |
| 273 | |
def prepare_tokenizer(args):
    """Build the tokenizer and store the padded vocabulary size on ``args``.

    The tokenizer's ``num_tokens`` is rounded up to the nearest multiple of
    ``args.make_vocab_size_divisible_by`` times the model-parallel world
    size. The padded size is written to ``args.vocab_size``; the tokenizer
    object itself is returned unchanged.

    NOTE(review): the padding is presumably there so the vocabulary splits
    evenly across model-parallel ranks -- confirm against the model code.

    Args:
        args: Namespace read for ``make_vocab_size_divisible_by`` (and passed
            to ``get_tokenizer``); ``vocab_size`` is set on it as a side
            effect.

    Returns:
        The object returned by ``get_tokenizer(args)``.
    """
    tokenizer = get_tokenizer(args)

    original_size = tokenizer.num_tokens
    divisor = (args.make_vocab_size_divisible_by
               * mpu.get_model_parallel_world_size())

    # Count how many dummy entries are needed to reach the next multiple.
    num_dummy = 0
    while (original_size + num_dummy) % divisor:
        num_dummy += 1
    padded_size = original_size + num_dummy

    print_rank_0('> padded vocab (size: {}) with {} dummy '
                 'tokens (new size: {})'.format(
                     original_size, num_dummy, padded_size))

    args.vocab_size = padded_size
    print("prepare tokenizer done", flush=True)

    return tokenizer
| 293 | |
| 294 | |
| 295 | def main(): |
no test coverage detected