hub / github.com/NVIDIA-NeMo/RL / test_math_hf_data_processor

Function test_math_hf_data_processor

tests/unit/data/test_data_processor.py:124–160 · view source on GitHub ↗

(tokenizer_name, dataset_cls)

Source from the content-addressed store, hash-verified

122	],
123	)
def test_math_hf_data_processor(tokenizer_name, dataset_cls):
    """Check that ``math_hf_data_processor`` yields a usable first sample.

    The dataset produced by ``dataset_cls`` is wrapped in an
    ``AllTaskProcessedDataset`` (``max_seq_length=128``) together with a
    tokenizer built from ``tokenizer_name``, and the first processed item is
    inspected.

    Args:
        tokenizer_name: Forwarded as ``name`` to ``TokenizerConfig``.
        dataset_cls: Called with no arguments; the returned object must
            expose ``task_name`` and ``dataset`` attributes.
    """
    # Build the raw dataset and remember which task it belongs to.
    source = dataset_cls()
    task = source.task_name

    # Tokenizer using the default chat template.
    tokenizer_config = TokenizerConfig(
        name=tokenizer_name,
        chat_template="default",
    )
    tok = get_tokenizer(tokenizer_config)

    # Task specification: chain-of-thought prompt file, no system prompt.
    spec = TaskDataSpec(
        task_name=task,
        prompt_file=f"{os.path.dirname(abspath)}/../../../examples/prompts/cot.txt",
        system_prompt_file=None,
    )

    # Every task name (known or not) maps to the same (spec, processor) pair;
    # the dataset's own task is also registered explicitly.
    def _fallback_entry():
        return (spec, math_hf_data_processor)

    processors: dict[str, tuple[TaskDataSpec, TaskDataProcessFnCallable]] = (
        defaultdict(_fallback_entry)
    )
    processors[task] = (spec, math_hf_data_processor)

    processed = AllTaskProcessedDataset(
        dataset=source.dataset,
        tokenizer=tok,
        default_task_data_spec=spec,
        task_data_processors=processors,
        max_seq_length=128,
    )

    # Fetching the first item should succeed (i.e. the BOS token assertion
    # passes) and the item must carry a non-empty message log.
    sample = processed[0]
    assert sample is not None
    assert "message_log" in sample
    message_log = sample["message_log"]
    assert len(message_log) > 0
161
162
163	def test_math_hf_data_processor_without_prompt():

nothing calls this directly

get_tokenizer — Function · 0.90

TokenizerConfig — Class · 0.90

TaskDataSpec — Class · 0.90

AllTaskProcessedDataset — Class · 0.90

no test coverage detected