MCPcopy Index your code
hub / github.com/THUDM/GLM / prepare_tokenizer

Function prepare_tokenizer

configure_data.py:120–151  ·  view source on GitHub ↗
(args)

Source from the content-addressed store, hash-verified

118
119
def prepare_tokenizer(args):
    """Create the tokenizer and record the padded vocab size and EOD id on ``args``.

    The tokenizer is built by ``make_tokenizer`` from the tokenizer-related
    fields of ``args``. Model-parallel rank 0 reads the tokenizer's token count
    and the id of its ``'eos'`` command token, rounds the count up to the next
    multiple of ``args.make_vocab_size_divisible_by`` and logs both numbers.
    The pair is then broadcast from ``mpu.get_model_parallel_src_rank()`` over
    the model-parallel group, so every rank in the group stores the same
    values.

    The exchange goes through a CUDA tensor and
    ``torch.distributed.broadcast``, so both must be usable when this runs.

    Args:
        args: Configuration object. Fields read here: ``sentinel_token``,
            ``max_position_embeddings``, ``tokenizer_type``,
            ``tokenizer_path``, ``vocab_size``, ``tokenizer_model_type``,
            ``block_lm``, ``cache_dir``, ``task_mask``, ``block_mask_prob``,
            ``context_mask_ratio``, ``fix_command_token`` and
            ``make_vocab_size_divisible_by``. ``vocab_size`` is overwritten
            with the padded size and ``eod_token`` is set to the broadcast
            token id.

    Returns:
        The tokenizer returned by ``make_tokenizer``.

    Raises:
        AssertionError: On model-parallel rank 0, if the ``'eos'`` and
            ``'pad'`` command tokens do not share the same id.
    """
    # A non-zero value asks make_tokenizer for sentinel tokens; 0 disables them.
    sentinel_count = args.max_position_embeddings if args.sentinel_token else 0
    needs_decoder_mask = args.block_mask_prob > 0.0 or args.context_mask_ratio > 0.0

    tokenizer = make_tokenizer(
        args.tokenizer_type,
        None,
        args.tokenizer_path,
        args.vocab_size,
        args.tokenizer_model_type,
        add_block_symbols=args.block_lm,
        cache_dir=args.cache_dir,
        add_sentinel_token=sentinel_count,
        add_task_mask=args.task_mask,
        add_decoder_mask=needs_decoder_mask,
        fix_command_token=args.fix_command_token,
    )

    if mpu.get_model_parallel_rank() == 0:
        raw_size = tokenizer.num_tokens
        eod_id = tokenizer.get_command('eos').Id
        # The rest of this function treats the eos id as the EOD token and
        # requires the pad command to map to the same id.
        assert eod_id == tokenizer.get_command('pad').Id

        # Grow the vocab size until it is an exact multiple of the divisor.
        divisor = args.make_vocab_size_divisible_by
        padded_size = raw_size
        while padded_size % divisor:
            padded_size += 1

        print_rank_0('> padded vocab (size: {}) with {} dummy '
                     'tokens (new size: {})'.format(raw_size, padded_size - raw_size, padded_size))
        print_rank_0('> found end-of-document token: {}'.format(eod_id))
        payload = [padded_size, eod_id]
    else:
        # Placeholder values; the broadcast below overwrites them.
        payload = [0, 0]

    token_counts = torch.cuda.LongTensor(payload)

    # Share rank 0's numbers with the rest of the model-parallel group.
    torch.distributed.broadcast(
        token_counts,
        mpu.get_model_parallel_src_rank(),
        group=mpu.get_model_parallel_group(),
    )

    args.vocab_size, args.eod_token = token_counts[0].item(), token_counts[1].item()
    return tokenizer
152
153
154def make_data_loader(dataset, tokenizer, batch_size, num_iters, args, shuffle=False, block_collate=False):

Callers 3

finetune — Function · 0.90
main — Function · 0.90
main — Function · 0.90

Calls 3

make_tokenizer — Function · 0.90
print_rank_0 — Function · 0.90
get_command — Method · 0.80

Tested by

no test coverage detected