| 180 | |
| 181 | |
class Adam(Optimizer):
    """Adam optimizer: per-parameter steps scaled by running moment estimates.

    For every parameter the optimizer keeps an exponential moving average of
    the gradient (``ms``) and of the squared gradient (``vs``). Both start at
    zero, so the step size is rescaled by a bias-correction factor that
    depends on the number of updates performed so far (``t``).

    Args:
        learning_rate: Base step size.
        beta_1: Decay rate of the gradient moving average.
        beta_2: Decay rate of the squared-gradient moving average.
        epsilon: Constant added to the denominator of the step so it never
            divides by zero.
    """

    def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8):
        self.epsilon = epsilon
        self.beta_2 = beta_2
        self.beta_1 = beta_1
        self.lr = learning_rate
        # Not read anywhere in this class; presumably consumed by the
        # Optimizer base class -- TODO confirm.
        self.iterations = 0
        # 1-based update counter used by the bias correction in update().
        self.t = 1

    def update(self, network):
        """Apply one Adam step to every parameter of ``network``.

        ``setup`` must have been called first: it creates the ``ms`` / ``vs``
        accumulators this method reads and overwrites.

        Args:
            network: Object exposing ``parametric_layers``; each layer must
                provide ``parameters`` with ``keys()``, ``grad[name]`` and
                ``step(name, delta)``.
        """
        # The bias-corrected step size depends only on the update counter, so
        # it is identical for every parameter touched during this call.
        lr = (
            self.lr
            * np.sqrt(1.0 - self.beta_2**self.t)
            / (1.0 - self.beta_1**self.t)
        )

        for i, layer in enumerate(network.parametric_layers):
            for n in layer.parameters.keys():
                grad = layer.parameters.grad[n]

                # Moving averages of the gradient and of its square.
                self.ms[i][n] = (self.beta_1 * self.ms[i][n]) + (
                    1.0 - self.beta_1
                ) * grad
                self.vs[i][n] = (self.beta_2 * self.vs[i][n]) + (
                    1.0 - self.beta_2
                ) * grad**2

                step = lr * self.ms[i][n] / (np.sqrt(self.vs[i][n]) + self.epsilon)
                # Negated so the parameter moves against the gradient
                # (presumably step() adds the delta -- TODO confirm).
                layer.parameters.step(n, -step)

        # Advance the counter once per update call, not once per parameter;
        # otherwise the bias correction would decay too quickly.
        self.t += 1

    def setup(self, network):
        """Create zero-filled moment accumulators for every parameter.

        Accumulators are keyed first by the layer's index in
        ``network.parametric_layers`` and then by parameter name, and have the
        same shape as the parameter they track.

        Args:
            network: Object exposing ``parametric_layers`` whose ``parameters``
                support ``keys()`` and item lookup by name.
        """
        # Accumulators
        self.ms = defaultdict(dict)
        self.vs = defaultdict(dict)
        for i, layer in enumerate(network.parametric_layers):
            for n in layer.parameters.keys():
                self.ms[i][n] = np.zeros_like(layer.parameters[n])
                self.vs[i][n] = np.zeros_like(layer.parameters[n])
| 219 | |
| 220 | |
| 221 | class Adamax(Optimizer): |
no outgoing calls