()
| 83 | return tokenizer |
| 84 | |
def main():
    """Entry point: build the prompt dataset from command-line arguments.

    Steps, in order:
      1. Parse the command-line arguments via ``parse_args``.
      2. Pass ``args.seed`` to ``set_random_seed``.
      3. Load the tokenizer from ``args.tokenizer_path`` (fast variant).
      4. Look up the ids of the user / bot / end tokens and store them back
         on ``args`` as ``user_token_id`` / ``bot_token_id`` / ``end_token_id``.
      5. Hand everything to ``create_prompt_dataset``.
      6. Print a completion message.
    """
    args = parse_args()
    set_random_seed(args.seed)

    tok = load_telechat_tokenizer(args.tokenizer_path, fast_tokenizer=True)

    # Resolve each special token to its vocabulary id and attach it to
    # ``args``; the whole namespace is forwarded to the dataset builder below.
    for role in ("user", "bot", "end"):
        token_text = getattr(args, f"{role}_token")
        setattr(args, f"{role}_token_id", tok.convert_tokens_to_ids(token_text))

    # Arguments are passed positionally, in exactly this order.
    dataset_args = (
        args.data_path,
        args.data_output_path,
        args.seed,
        tok,
        args.max_seq_len,
        args.num_workers,
        args.num_samples,
        args.process_method,
        args,
    )
    create_prompt_dataset(*dataset_args)

    print("Finish processing data!")
| 105 | |
| 106 | |
| 107 | if __name__ == "__main__": |
no test coverage detected