MCPcopy
hub / github.com/policy-gradient/GRPO-Zero / answer_reward_function

Function answer_reward_function

countdown_task.py:113–145  ·  view source on GitHub ↗

Checks if the answer uses all numbers exactly once and evaluates to the target

(
    response: str, numbers: List[int] = None, target: int = None
)

Source from the content-addressed store, hash-verified

111
112
def answer_reward_function(
    response: str, numbers: List[int] = None, target: int = None
) -> float:
    """
    Score the arithmetic expression inside a response's ``<answer>`` tag.

    The reward is 1.0 only when the first ``<answer>...</answer>`` span in
    ``response``:

    * consists solely of digits, ``+ - * /``, parentheses and spaces,
    * contains exactly the integers in ``numbers`` (same multiset, any
      order), and
    * evaluates to within 1e-5 of ``target``.

    Every other outcome scores 0.0, including a missing or empty tag, a
    missing ``numbers`` or ``target``, and an expression that fails to
    evaluate (syntax error, division by zero, ...).

    Args:
        response: Text to search for an ``<answer>`` tag.
        numbers: Integers the expression must use, each exactly once.
        target: Value the expression must evaluate to.

    Returns:
        1.0 for a valid expression that hits the target, otherwise 0.0.
    """
    # Without both the number set and the target there is nothing to verify.
    # Score 0.0 rather than letting sorted(None) raise TypeError further down;
    # a missing target already scored 0.0, so both defaults now agree.
    if numbers is None or target is None:
        return 0.0

    answer_regex = r"<answer>(.*?)<\/answer>"
    answer_match = re.search(answer_regex, response, re.DOTALL)
    if not answer_match:
        return 0.0

    answer_content = answer_match.group(1)
    if not answer_content:
        return 0.0

    # Character whitelist: this is what keeps names, attribute access and
    # calls to named functions away from eval() below.
    allowed_chars = r"^[0-9+\-*/() ]+$"
    if not re.match(allowed_chars, answer_content):
        return 0.0

    # Check if the answer uses all numbers exactly once
    used_numbers = [int(n) for n in re.findall(r"\d+", answer_content)]
    if sorted(used_numbers) != sorted(numbers):
        return 0.0

    # Check if the answer evaluates to the target
    try:
        # NOTE(review): eval() runs on model-generated text. The whitelist
        # above limits it to arithmetic, but "**" and "//" still pass because
        # they are built from allowed characters, so a large exponent tower
        # can be slow to evaluate. Consider a dedicated expression parser.
        result = eval(answer_content, {"__builtins__": None}, {})
        hit_target = abs(float(result) - float(target)) < 1e-5
    except Exception:
        # Malformed expression, division by zero, overflow, non-numeric
        # result, ...: no reward. KeyboardInterrupt and SystemExit are not
        # Exception subclasses and deliberately propagate.
        return 0.0

    return 1.0 if hit_target else 0.0
146
147
148def reward_function(

Callers 1

reward_function · Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected