(args)
| 217 | context_count += 1 |
| 218 | |
def prepare_tokenizer(args):
    """Create the tokenizer configured by ``args`` and record its sizes.

    Reads ``tokenizer_type``, ``tokenizer_path``, ``vocab_size``,
    ``tokenizer_model_type`` and ``cache_dir`` from ``args`` and forwards
    them to ``make_tokenizer`` (no corpus is supplied).

    Side effects on ``args``:
        * ``tokenizer_num_tokens`` and ``tokenizer_num_type_tokens`` are
          copied from the tokenizer.
        * ``eod_token`` is set to the ``Id`` of the tokenizer's ``'eos'``
          command.
        * ``vocab_size`` is overwritten with the tokenizer's token count
          rounded up to the next multiple of the model-parallel world size.

    Returns:
        The tokenizer object produced by ``make_tokenizer``.
    """
    tokenizer = make_tokenizer(
        tokenizer_type=args.tokenizer_type,
        corpus=None,
        model_path=args.tokenizer_path,
        vocab_size=args.vocab_size,
        model_type=args.tokenizer_model_type,
        cache_dir=args.cache_dir,
    )

    # Expose tokenizer-derived sizes and the end-of-document id on args.
    args.tokenizer_num_tokens = tokenizer.num_tokens
    args.tokenizer_num_type_tokens = tokenizer.num_type_tokens
    args.eod_token = tokenizer.get_command('eos').Id

    # Grow the vocabulary size one step at a time until it divides evenly
    # by the model-parallel world size.
    padded_vocab_size = tokenizer.num_tokens
    while padded_vocab_size % mpu.get_model_parallel_world_size() != 0:
        padded_vocab_size += 1
    args.vocab_size = padded_vocab_size

    print("prepare tokenizer done", flush=True)
    return tokenizer
| 242 | |
| 243 | def main(): |
| 244 | """Main training program.""" |
no test coverage detected