MCPcopy
hub / github.com/karpathy/neuraltalk / step

Method step

imagernn/solver.py:15–87  ·  view source on GitHub ↗

Perform a single batch update. Takes as input: a batch of data (X), a model (W), and a cost function that takes (batch, model) and returns a dict with 'cost', 'grad', and 'stats' entries.

(self, batch, model, cost_function, **kwargs)

Source from the content-addressed store, hash-verified

13 self.step_cache2_ = {} # might need this
14
15 def step(self, batch, model, cost_function, **kwargs):
16 """
17 perform a single batch update. Takes as input:
18 - batch of data (X)
19 - model (W)
20 - cost function which takes batch, model
21 """
22
23 learning_rate = kwargs.get('learning_rate', 0.0)
24 update = kwargs.get('update', model.keys())
25 grad_clip = kwargs.get('grad_clip', -1)
26 solver = kwargs.get('solver', 'vanilla')
27 momentum = kwargs.get('momentum', 0)
28 smooth_eps = kwargs.get('smooth_eps', 1e-8)
29 decay_rate = kwargs.get('decay_rate', 0.999)
30
31 if not (solver == 'vanilla' and momentum == 0):
32 # lazily make sure we initialize step cache if needed
33 for u in update:
34 if not u in self.step_cache_:
35 self.step_cache_[u] = np.zeros(model[u].shape)
36 if solver == 'adadelta':
37 self.step_cache2_[u] = np.zeros(model[u].shape) # adadelta needs one more cache
38
39 # compute cost and gradient
40 cg = cost_function(batch, model)
41 cost = cg['cost']
42 grads = cg['grad']
43 stats = cg['stats']
44
45 # clip gradients if needed, simplest possible version
46 # todo later: maybe implement the gradient direction conserving version
47 if grad_clip > 0:
48 for p in update:
49 if p in grads:
50 grads[p] = np.minimum(grads[p], grad_clip)
51 grads[p] = np.maximum(grads[p], -grad_clip)
52
53 # perform parameter update
54 for p in update:
55 if p in grads:
56
57 if solver == 'vanilla': # vanilla sgd, optional with momentum
58 if momentum > 0:
59 dx = momentum * self.step_cache_[p] - learning_rate * grads[p]
60 self.step_cache_[p] = dx
61 else:
62 dx = - learning_rate * grads[p]
63
64 elif solver == 'rmsprop':
65 self.step_cache_[p] = self.step_cache_[p] * decay_rate + (1.0 - decay_rate) * grads[p] ** 2
66 dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache_[p] + smooth_eps)
67
68 elif solver == 'adagrad':
69 self.step_cache_[p] += grads[p] ** 2
70 dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache_[p] + smooth_eps)
71
72 elif solver == 'adadelta':

Callers 1

mainFunction · 0.95

Calls

no outgoing calls

Tested by

no test coverage detected