Source code for slugnet.optimizers
import numpy as np

from slugnet.initializations import _zero

class Optimizer(object):
    def __init__(self, lr=0.001, clip=-1, decay=0., lr_min=0., lr_max=np.inf):
        self.lr = lr
        self.clip = clip
        self.decay = decay
        self.lr_min = lr_min
        self.lr_max = lr_max
        self.iterations = 0

    def update(self, params, grads):
        """Update parameters.

        Parameters
        ----------
        params : list
            A list of parameters in the model.
        grads : list
            A list of gradients in the model.
        """
        self.iterations += 1
        # Decay the learning rate once per update, then clamp it to
        # [lr_min, lr_max].
        self.lr *= 1. / (1. + self.decay * self.iterations)
        self.lr = np.clip(self.lr, self.lr_min, self.lr_max)

    def __str__(self):
        return self.__class__.__name__
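

# A minimal usage sketch (not part of the library): the base class only
# maintains the learning-rate schedule. With a hypothetical weight/gradient
# pair, each call shrinks lr by a factor of 1 / (1 + decay * t) at
# iteration t, as implemented in Optimizer.update above.
def _demo_optimizer_decay():
    opt = Optimizer(lr=0.1, decay=0.5)
    w, g = [np.ones(3)], [np.full(3, 0.2)]
    opt.update(w, g)  # t = 1: lr = 0.1 / (1 + 0.5) ~= 0.0667
    opt.update(w, g)  # t = 2: lr ~= 0.0667 / (1 + 1.0) ~= 0.0333
    return opt.lr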


class SGD(Optimizer):
    """
    Optimize model parameters using common stochastic gradient descent.
    """

    def update(self, params, grads):
        for p, g in zip(params, grads):
            p -= self.lr * npdl_clip(g, self.clip)
        super(SGD, self).update(params, grads)


def npdl_clip(grad, boundary):
    # Clip gradients element-wise to [-boundary, boundary]; a non-positive
    # boundary disables clipping.
    if boundary > 0:
        return np.clip(grad, -boundary, boundary)
    return grad
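

# A minimal usage sketch (not part of the library): one SGD step on a
# hypothetical two-element weight vector; with the default clip=-1 the
# gradient passes through npdl_clip unchanged.
def _demo_sgd_step():
    opt = SGD(lr=0.1)
    w = [np.array([1.0, -2.0])]
    g = [np.array([0.5, 0.5])]
    opt.update(w, g)  # w <- w - 0.1 * g = [0.95, -2.05]
    return w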


class RMSProp(Optimizer):
    """RMSProp updates.

    Scale learning rates by dividing by the moving average of the root mean
    squared (RMS) gradients. See [1]_ for further description.

    :param rho: Gradient moving average decay factor.
    :type rho: float
    :param epsilon: Small value added for numerical stability.
    :type epsilon: float

    `rho` should be between 0 and 1. A value of `rho` close to 1 decays the
    moving average slowly, while a value close to 0 decays it quickly.

    Using the step size :math:`\\eta` and a decay factor :math:`\\rho`, the
    learning rate :math:`\\eta_t` is calculated as:

    .. math::
        r_t &= \\rho r_{t-1} + (1 - \\rho) g^2\\\\
        \\eta_t &= \\frac{\\eta}{\\sqrt{r_t + \\epsilon}}

    References
    ----------
    .. [1] Tieleman, T. and Hinton, G. (2012):
           Neural Networks for Machine Learning, Lecture 6.5 - rmsprop.
           Coursera. http://www.youtube.com/watch?v=O3sxAc4hxZU (formula @5:20)
    """

    def __init__(self, rho=0.9, epsilon=1e-6, *args, **kwargs):
        super(RMSProp, self).__init__(*args, **kwargs)
        self.rho = rho
        self.epsilon = epsilon
        self.cache = None
        self.iterations = 0

    def update(self, params, grads):
        # Initialize the running average of squared gradients on first use.
        if self.cache is None:
            self.cache = [_zero(p.shape) for p in params]

        # Accumulate squared gradients and scale each step by the RMS.
        for i, (c, p, g) in enumerate(zip(self.cache, params, grads)):
            c = self.rho * c + (1 - self.rho) * np.power(g, 2)
            p -= (self.lr * g / np.sqrt(c + self.epsilon))
            self.cache[i] = c
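

# A minimal usage sketch (not part of the library): a single RMSProp step
# on a hypothetical one-element parameter. On the first call the cache is
# zero, so r_1 = (1 - rho) * g^2 and the step is lr * g / sqrt(r_1 + eps).
def _demo_rmsprop_step():
    opt = RMSProp(rho=0.9, epsilon=1e-6, lr=0.001)
    w = [np.array([1.0])]
    g = [np.array([0.5])]
    opt.update(w, g)
    # r_1 = 0.1 * 0.25 = 0.025; step ~= 0.001 * 0.5 / sqrt(0.025) ~= 0.00316
    return w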