(tokenizer_name, dataset_cls)
| 122 | ], |
| 123 | ) |
def test_math_hf_data_processor(tokenizer_name, dataset_cls):
    """Smoke-test ``math_hf_data_processor`` for one tokenizer/dataset pair.

    Instantiates ``dataset_cls``, wraps its ``dataset`` attribute in an
    ``AllTaskProcessedDataset`` configured with ``math_hf_data_processor`` and
    the ``cot.txt`` prompt file, then checks that the first processed item
    exposes a non-empty ``message_log``.

    Args:
        tokenizer_name: Tokenizer name forwarded to ``TokenizerConfig``.
        dataset_cls: Zero-argument callable; the object it returns must
            provide ``task_name`` and ``dataset`` attributes.
    """
    # Build the raw dataset; its task name keys the processor mapping below.
    raw_data = dataset_cls()
    task_name = raw_data.task_name

    # Tokenizer configured with the "default" chat template.
    tokenizer_config = TokenizerConfig(
        name=tokenizer_name,
        chat_template="default",
    )
    tokenizer = get_tokenizer(tokenizer_config)

    # Task specification: prompt file resolved relative to this module's
    # ``abspath``; no system prompt file is supplied.
    prompt_path = f"{os.path.dirname(abspath)}/../../../examples/prompts/cot.txt"
    task_spec = TaskDataSpec(
        task_name=task_name,
        prompt_file=prompt_path,
        system_prompt_file=None,
    )

    # Any task name falls back to the math processor via the default factory;
    # the dataset's own task name is additionally registered explicitly.
    processors: dict[str, tuple[TaskDataSpec, TaskDataProcessFnCallable]] = (
        defaultdict(lambda: (task_spec, math_hf_data_processor))
    )
    processors[task_name] = (task_spec, math_hf_data_processor)

    processed = AllTaskProcessedDataset(
        dataset=raw_data.dataset,
        tokenizer=tokenizer,
        default_task_data_spec=task_spec,
        task_data_processors=processors,
        max_seq_length=128,
    )

    # Retrieving the first item is expected to succeed when the BOS token
    # assertion passes (that assertion lives outside this test).
    sample = processed[0]
    assert sample is not None
    assert "message_log" in sample
    messages = sample["message_log"]
    assert len(messages) > 0
| 161 | |
| 162 | |
| 163 | def test_math_hf_data_processor_without_prompt(): |
nothing calls this directly
no test coverage detected