Estimate the average number of characters per token in the dataset.
(dataset, tokenizer, input_column_name="prompt", output_column_name="completion", nb_examples=400)
| 91 | |
| 92 | |
def chars_token_ratio(dataset, tokenizer, input_column_name="prompt", output_column_name="completion", nb_examples=400):
    """
    Compute the mean number of characters per token over a sample of the dataset.

    At most `nb_examples` examples are read from `dataset`. Each one is turned
    into text with `prepare_sample_text(example, input_column_name,
    output_column_name)`, then its characters and tokens are counted.

    Args:
        dataset: Iterable of examples accepted by `prepare_sample_text`.
        tokenizer: Tokenizer exposing `is_fast`; it is called directly when
            `is_fast` is true, otherwise its `tokenize` method is used.
        input_column_name: Forwarded to `prepare_sample_text`.
        output_column_name: Forwarded to `prepare_sample_text`.
        nb_examples: Upper bound on the number of examples to inspect.

    Returns:
        Total character count divided by total token count.

    Raises:
        ZeroDivisionError: If no tokens were counted (for instance when the
            dataset is empty or `nb_examples` is 0).
    """
    char_count = 0
    token_count = 0

    # Pairing with range(nb_examples) caps how many examples are consumed.
    sampled = tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples)
    for _, sample in sampled:
        sample_text = prepare_sample_text(sample, input_column_name, output_column_name)
        char_count += len(sample_text)

        # With a fast tokenizer the tokens are read from the returned encoding;
        # otherwise the text is split with `tokenize`.
        pieces = tokenizer(sample_text).tokens() if tokenizer.is_fast else tokenizer.tokenize(sample_text)
        token_count += len(pieces)

    return char_count / token_count
| 107 | |
| 108 | |
| 109 | def print_trainable_parameters(model): |
no test coverage detected