(args)
| 417 | return make_tokenizer(**tokenizer_args) |
| 418 | |
| 419 | def get_eval_data(args): |
| 420 | val_dataloader = None |
| 421 | if mpu.get_model_parallel_rank() == 0: |
| 422 | eval_batch_size = args.eval_batch_size |
| 423 | eval_batch_size = args.batch_size if eval_batch_size is None else eval_batch_size |
| 424 | seq_len = args.seq_length |
| 425 | valid_data = args.valid_data |
| 426 | valid_data = valid_data[0] if isinstance(valid_data, list) else valid_data |
| 427 | |
| 428 | tokenizer = get_tokenizer(args) |
| 429 | |
| 430 | if not args.cloze_eval: |
| 431 | |
| 432 | with open(valid_data, "rb") as reader: |
| 433 | entire_data = reader.read().decode('utf-8') |
| 434 | num_original_tokens = len(entire_data.strip().split(" ")) |
| 435 | entire_data = get_detokenizer(valid_data)(entire_data) |
| 436 | tokenized_data = tokenizer.EncodeAsIds(entire_data).tokenization |
| 437 | num_tokenized_tokens = len(tokenized_data) |
| 438 | string = 'Original Tokens: %d, Detokenized tokens: %d' % (num_tokenized_tokens, num_original_tokens) |
| 439 | print_rank_0(string) |
| 440 | |
| 441 | eod_token = tokenizer.get_command('pad').Id |
| 442 | val_dataset = LM_Eval_Dataset(tokenized_data, seq_len, eod_token, |
| 443 | args.overlapping_eval) |
| 444 | else: |
| 445 | val_dataset = Lambada_Eval_Dataset(valid_data, tokenizer, seq_len) |
| 446 | num_tokenized_tokens = 0 |
| 447 | num_original_tokens = 0 |
| 448 | val_dataloader = torch.utils.data.DataLoader( |
| 449 | val_dataset, batch_size=eval_batch_size, drop_last=False) |
| 450 | |
| 451 | before = tokenizer.num_tokens |
| 452 | after = before |
| 453 | while after % mpu.get_model_parallel_world_size() != 0: |
| 454 | after += 1 |
| 455 | print_rank_0('> padded vocab (size: {}) with {} dummy tokens (new size: {})'. |
| 456 | format(before, after - before, after)) |
| 457 | eod_token = tokenizer.get_command('pad').Id |
| 458 | num_examples = len(val_dataset) |
| 459 | token_counts = torch.cuda.LongTensor([after, eod_token, num_examples, |
| 460 | num_original_tokens, |
| 461 | num_tokenized_tokens]) |
| 462 | else: |
| 463 | token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0]) |
| 464 | torch.distributed.broadcast(token_counts, |
| 465 | mpu.get_model_parallel_src_rank(), |
| 466 | group=mpu.get_model_parallel_group()) |
| 467 | args.vocab_size = token_counts[0].item() |
| 468 | args.eod_token = token_counts[1].item() |
| 469 | args.num_examples = token_counts[2].item() |
| 470 | args.num_original_tokens = token_counts[3].item() |
| 471 | args.num_tokenized_tokens = token_counts[4].item() |
| 472 | |
| 473 | print('global rank: {} | vocab size: {} | eod token: {} | ' |
| 474 | 'num_examples: {} | num_original_tokens: {} | ' |
| 475 | 'num_tokenized_tokens: {}'.format( |
| 476 | torch.distributed.get_rank(), args.vocab_size, |
no test coverage detected