MCPcopy Index your code
hub / github.com/NVIDIA-NeMo/RL / test_math_hf_data_processor

Function test_math_hf_data_processor

tests/unit/data/test_data_processor.py:124–160  ·  view source on GitHub ↗
(tokenizer_name, dataset_cls)

Source from the content-addressed store, hash-verified

122 ],
123)
def test_math_hf_data_processor(tokenizer_name, dataset_cls):
    """Check that a dataset processed by ``math_hf_data_processor`` yields a usable first item.

    The dataset class is instantiated, wrapped in an ``AllTaskProcessedDataset``
    configured with the chain-of-thought prompt file and a 128-token sequence
    limit, and its first item is fetched and inspected.

    Args:
        tokenizer_name: Passed as ``name`` to ``TokenizerConfig`` when building
            the tokenizer.
        dataset_cls: Zero-argument callable; the object it returns must expose
            ``task_name`` and ``dataset`` attributes.
    """
    # Build the raw dataset and remember which task it belongs to.
    raw_data = dataset_cls()
    task = raw_data.task_name

    # Tokenizer using the "default" chat template.
    tok_config = TokenizerConfig(
        name=tokenizer_name,
        chat_template="default",
    )
    tok = get_tokenizer(tok_config)

    # Task spec: chain-of-thought prompt, no system prompt.
    cot_prompt_path = (
        f"{os.path.dirname(abspath)}/../../../examples/prompts/cot.txt"
    )
    spec = TaskDataSpec(
        task_name=task,
        prompt_file=cot_prompt_path,
        system_prompt_file=None,
    )

    # Every task name resolves to the same (spec, processor) pair; the
    # dataset's own task is also registered explicitly.
    spec_and_processor = (spec, math_hf_data_processor)
    processors: dict[str, tuple[TaskDataSpec, TaskDataProcessFnCallable]] = (
        defaultdict(lambda: spec_and_processor)
    )
    processors[task] = spec_and_processor

    processed = AllTaskProcessedDataset(
        dataset=raw_data.dataset,
        tokenizer=tok,
        default_task_data_spec=spec,
        task_data_processors=processors,
        max_seq_length=128,
    )

    # Test that the first item can be retrieved when the BOS token assertion passes
    sample = processed[0]
    assert sample is not None
    assert "message_log" in sample
    assert len(sample["message_log"]) > 0
161
162
163def test_math_hf_data_processor_without_prompt():

Callers

nothing calls this directly

Calls 4

get_tokenizer · Function · 0.90
TokenizerConfig · Class · 0.90
TaskDataSpec · Class · 0.90

Tested by

no test coverage detected