MCPcopy
hub / github.com/deepspeedai/DeepSpeedExamples / get_eval_data

Function get_eval_data

Megatron-LM/evaluate_gpt2.py:419–479  ·  view source on GitHub ↗
(args)

Source from the content-addressed store, hash-verified

417 return make_tokenizer(**tokenizer_args)
418
419def get_eval_data(args):
420 val_dataloader = None
421 if mpu.get_model_parallel_rank() == 0:
422 eval_batch_size = args.eval_batch_size
423 eval_batch_size = args.batch_size if eval_batch_size is None else eval_batch_size
424 seq_len = args.seq_length
425 valid_data = args.valid_data
426 valid_data = valid_data[0] if isinstance(valid_data, list) else valid_data
427
428 tokenizer = get_tokenizer(args)
429
430 if not args.cloze_eval:
431
432 with open(valid_data, "rb") as reader:
433 entire_data = reader.read().decode('utf-8')
434 num_original_tokens = len(entire_data.strip().split(" "))
435 entire_data = get_detokenizer(valid_data)(entire_data)
436 tokenized_data = tokenizer.EncodeAsIds(entire_data).tokenization
437 num_tokenized_tokens = len(tokenized_data)
438 string = 'Original Tokens: %d, Detokenized tokens: %d' % (num_tokenized_tokens, num_original_tokens)
439 print_rank_0(string)
440
441 eod_token = tokenizer.get_command('pad').Id
442 val_dataset = LM_Eval_Dataset(tokenized_data, seq_len, eod_token,
443 args.overlapping_eval)
444 else:
445 val_dataset = Lambada_Eval_Dataset(valid_data, tokenizer, seq_len)
446 num_tokenized_tokens = 0
447 num_original_tokens = 0
448 val_dataloader = torch.utils.data.DataLoader(
449 val_dataset, batch_size=eval_batch_size, drop_last=False)
450
451 before = tokenizer.num_tokens
452 after = before
453 while after % mpu.get_model_parallel_world_size() != 0:
454 after += 1
455 print_rank_0('> padded vocab (size: {}) with {} dummy tokens (new size: {})'.
456 format(before, after - before, after))
457 eod_token = tokenizer.get_command('pad').Id
458 num_examples = len(val_dataset)
459 token_counts = torch.cuda.LongTensor([after, eod_token, num_examples,
460 num_original_tokens,
461 num_tokenized_tokens])
462 else:
463 token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])
464 torch.distributed.broadcast(token_counts,
465 mpu.get_model_parallel_src_rank(),
466 group=mpu.get_model_parallel_group())
467 args.vocab_size = token_counts[0].item()
468 args.eod_token = token_counts[1].item()
469 args.num_examples = token_counts[2].item()
470 args.num_original_tokens = token_counts[3].item()
471 args.num_tokenized_tokens = token_counts[4].item()
472
473 print('global rank: {} | vocab size: {} | eod token: {} | '
474 'num_examples: {} | num_original_tokens: {} | '
475 'num_tokenized_tokens: {}'.format(
476 torch.distributed.get_rank(), args.vocab_size,

Callers 1

main Function · 0.85

Calls 8

print_rank_0 Function · 0.90
get_tokenizer Function · 0.85
get_detokenizer Function · 0.85
LM_Eval_Dataset Class · 0.85
decode Method · 0.80
get_command Method · 0.80
EncodeAsIds Method · 0.45

Tested by

no test coverage detected