(params)
| 176 | |
| 177 | |
def reward_function(params):
    """Play one episode with a policy built from ``params``; return its total reward.

    A fresh ``ANN(D, M, K)`` is created, loaded with ``params`` through
    ``set_params``, and asked for an action at every step of the module-level
    ``env`` until the environment reports ``done``.

    When ``HISTORY_LENGTH > 1`` the network input is a flat vector of
    ``HISTORY_LENGTH * obs_dim`` values holding the most recent observations,
    oldest first; slots for steps that have not happened yet are zero.
    Otherwise the input is the raw observation itself.

    Args:
        params: Parameter values handed unchanged to ``ANN.set_params``.

    Returns:
        The sum of the rewards returned by ``env.step`` over the episode.
    """
    model = ANN(D, M, K)
    model.set_params(params)

    obs = env.reset()
    obs_dim = len(obs)

    # HISTORY_LENGTH does not change during an episode, so decide once
    # whether the policy sees a stacked window or the raw observation.
    use_history = HISTORY_LENGTH > 1

    if use_history:
        # Start from an all-zero window; only the newest slot is known yet.
        state = np.zeros(HISTORY_LENGTH * obs_dim)
        state[-obs_dim:] = obs
    else:
        state = obs

    episode_reward = 0
    done = False
    while not done:
        action = model.sample_action(state)

        # This unpacking requires env.step to return exactly three values.
        obs, reward, done = env.step(action)
        episode_reward += reward

        if use_history:
            # Shift the window left by one observation (np.roll returns a
            # new array), then overwrite the wrapped-around oldest entry in
            # the last slot with the newest observation.
            state = np.roll(state, -obs_dim)
            state[-obs_dim:] = obs
        else:
            state = obs

    return episode_reward
| 211 | |
| 212 | |
| 213 | if __name__ == '__main__': |
no test coverage detected