(examples)
| 188 | |
| 189 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. |
def group_texts(examples):
    """Concatenate every column of a tokenized batch and re-split it into blocks.

    Relies on ``block_size``, ``tokenizer``, ``dialogue_template``,
    ``mask_user_labels`` and ``chain`` being available from the enclosing scope.

    Args:
        examples: Mapping from column name to a list of per-example sequences.
            It must contain an ``"input_ids"`` column.

    Returns:
        A dict with the same keys as ``examples``, each mapped to a list of
        chunks of at most ``block_size`` items, plus a ``"labels"`` entry built
        from the ``"input_ids"`` chunks and passed through ``mask_user_labels``.
    """
    # Flatten each column into one long sequence.
    concatenated = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated[list(examples.keys())[0]])
    # Drop the trailing remainder so every chunk is full. A batch shorter than
    # block_size is left untouched and comes out as one short chunk.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split every column into consecutive chunks of block_size.
    result = {
        k: [seq[start : start + block_size] for start in range(0, total_length, block_size)]
        for k, seq in concatenated.items()
    }
    # Copy each chunk, not only the outer list: mask_user_labels is called for
    # its in-place effect, and a shallow copy would share the inner lists with
    # result["input_ids"], so masking a label would also overwrite the input.
    # NOTE(review): confirm how mask_user_labels mutates its argument.
    labels = [list(chunk) for chunk in result["input_ids"]]
    mask_user_labels(tokenizer, dialogue_template, labels)
    result["labels"] = labels
    return result
| 207 | |
| 208 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder |
| 209 | # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower |
nothing calls this directly
no test coverage detected