A simple tabular environment model that maintains the counts of each reward-outcome pair given the state and action that preceded them. The model can be queried with >>> M = EnvModel() >>> M[(state, action, reward, next_state)] += 1 >>> M[(state, action, reward, next_state)]
| 26 | |
| 27 | |
| 28 | class EnvModel(object): |
| 29 | """ |
| 30 | A simple tabular environment model that maintains the counts of each |
| 31 | reward-outcome pair given the state and action that preceded them. The |
| 32 | model can be queried with |
| 33 | |
| 34 | >>> M = EnvModel() |
| 35 | >>> M[(state, action, reward, next_state)] += 1 |
| 36 | >>> M[(state, action, reward, next_state)] |
| 37 | 1 |
| 38 | >>> M.state_action_pairs() |
| 39 | [(state, action)] |
| 40 | >>> M.outcome_probs(state, action) |
| 41 | [(next_state, 1)] |
| 42 | """ |
| 43 | |
| 44 | def __init__(self): |
| 45 | super(EnvModel, self).__init__() |
| 46 | self._model = defaultdict(lambda: defaultdict(lambda: 0)) |
| 47 | |
| 48 | def __setitem__(self, key, value): |
| 49 | """Set self[key] to value""" |
| 50 | s, a, r, s_ = key |
| 51 | self._model[(s, a)][(r, s_)] = value |
| 52 | |
| 53 | def __getitem__(self, key): |
| 54 | """Return the value associated with key""" |
| 55 | s, a, r, s_ = key |
| 56 | return self._model[(s, a)][(r, s_)] |
| 57 | |
| 58 | def __contains__(self, key): |
| 59 | """True if EnvModel contains `key`, else False""" |
| 60 | s, a, r, s_ = key |
| 61 | p1 = (s, a) in self.state_action_pairs() |
| 62 | p2 = (r, s_) in self.reward_outcome_pairs() |
| 63 | return p1 and p2 |
| 64 | |
| 65 | def state_action_pairs(self): |
| 66 | """Return all (state, action) pairs in the environment model""" |
| 67 | return list(self._model.keys()) |
| 68 | |
| 69 | def reward_outcome_pairs(self, s, a): |
| 70 | """ |
| 71 | Return all (reward, next_state) pairs associated with taking action `a` |
| 72 | in state `s`. |
| 73 | """ |
| 74 | return list(self._model[(s, a)].keys()) |
| 75 | |
| 76 | def outcome_probs(self, s, a): |
| 77 | """ |
| 78 | Return the probability under the environment model of each outcome |
| 79 | state after taking action `a` in state `s`. |
| 80 | |
| 81 | Parameters |
| 82 | ---------- |
| 83 | s : int as returned by ``self._obs2num`` |
| 84 | The id for the state/observation. |
| 85 | a : int as returned by ``self._action2num`` |