(args)
| 118 | |
| 119 | |
def prepare_tokenizer(args):
    """Build the tokenizer and publish the padded vocab size and EOD id on ``args``.

    The tokenizer is created with ``make_tokenizer`` from the tokenizer-related
    fields of ``args``. Model-parallel rank 0 then pads the tokenizer's token
    count up to a multiple of ``args.make_vocab_size_divisible_by`` and looks up
    the id of the 'eos' command token; both numbers are broadcast over the
    model-parallel group so every rank ends up with the same values.

    Side effects: overwrites ``args.vocab_size`` (padded size) and sets
    ``args.eod_token``.

    Returns:
        The tokenizer object produced by ``make_tokenizer``.

    Raises:
        AssertionError: on model-parallel rank 0, if the 'eos' and 'pad'
            command tokens do not share the same id.
    """
    # When sentinel tokens are requested, their count is taken from
    # max_position_embeddings; otherwise none are added.
    sentinel_count = args.max_position_embeddings if args.sentinel_token else 0
    # A decoder mask is requested as soon as either masking knob is positive.
    wants_decoder_mask = args.block_mask_prob > 0.0 or args.context_mask_ratio > 0.0

    tokenizer = make_tokenizer(
        args.tokenizer_type,
        None,
        args.tokenizer_path,
        args.vocab_size,
        args.tokenizer_model_type,
        add_block_symbols=args.block_lm,
        cache_dir=args.cache_dir,
        add_sentinel_token=sentinel_count,
        add_task_mask=args.task_mask,
        add_decoder_mask=wants_decoder_mask,
        fix_command_token=args.fix_command_token,
    )

    if mpu.get_model_parallel_rank() != 0:
        # Placeholder values; the broadcast below is what fills this buffer.
        sync_buffer = torch.cuda.LongTensor([0, 0])
    else:
        raw_size = tokenizer.num_tokens
        eos_id = tokenizer.get_command('eos').Id
        # The code below treats 'eos' as the end-of-document token and
        # requires it to coincide with 'pad'.
        assert eos_id == tokenizer.get_command('pad').Id

        # Walk up to the next multiple of the requested divisor.
        divisor = args.make_vocab_size_divisible_by
        padded_size = raw_size
        while padded_size % divisor:
            padded_size += 1

        print_rank_0(
            '> padded vocab (size: {}) with {} dummy '
            'tokens (new size: {})'.format(raw_size, padded_size - raw_size, padded_size)
        )
        print_rank_0('> found end-of-document token: {}'.format(eos_id))
        sync_buffer = torch.cuda.LongTensor([padded_size, eos_id])

    # Share [padded vocab size, EOD id] across the model-parallel group.
    torch.distributed.broadcast(
        sync_buffer,
        mpu.get_model_parallel_src_rank(),
        group=mpu.get_model_parallel_group(),
    )

    args.vocab_size = sync_buffer[0].item()
    args.eod_token = sync_buffer[1].item()
    return tokenizer
| 152 | |
| 153 | |
| 154 | def make_data_loader(dataset, tokenizer, batch_size, num_iters, args, shuffle=False, block_collate=False): |
no test coverage detected