# test_learning_rate.py

"""
Comparing AdaGrad, AdaDelta and a constant learning rate in gradient descent (on the saddle-point function x^2 - y^2)

Reference:
1. Comparison of several learning-rate update schemes: http://ml.memect.com/archive/2014-12-12/short.html#3786866375172817
2. Saddle point, http://en.wikipedia.org/wiki/Saddle_point

"""
import numpy as np
import theano
import theano.tensor as T

# --- Hyper-parameters -------------------------------------------------------
# rho:      decay factor of the AdaDelta running averages.
# epsilon:  small constant added under the square roots (AdaDelta) and to the
#           denominator (AdaGrad).
# gamma:    base step size used by the AdaGrad section.
# const_lr: step size used by the constant-learning-rate section.
rho = 0.95
epsilon = 0.00001
gamma = 0.1

const_lr = 0.01

# --- Optimised variable -----------------------------------------------------
# Starting point; every optimiser is reset to it through make_func().
init_x = [0.1, 0.1]
x = theano.shared(
    np.array(init_x, dtype=theano.config.floatX),
    borrow=True,
    name="x",
)

# Tolerance on |cost|; not read by any active code visible in this file.
tolorate = 0.01

# Parameters and their shapes, kept as parallel lists so the optimiser
# sections can allocate one accumulator per parameter.
params = [x]
param_shapes = [(2,)]

# --- Objective --------------------------------------------------------------
# Alternative (bowl-shaped) cost, kept for experimentation:
# cost = 0.5 * (x[0]-2) ** 2 + (x[1]-2) ** 2
cost = x[0] ** 2 - x[1] ** 2

# One symbolic gradient per entry of `params`, in the same order.
param_grads = T.grad(cost, wrt=params)

def make_func(x, cost, updates, init_x):
    """Reset ``x`` to ``init_x`` and compile one optimisation step.

    Parameters:
        x: Theano shared variable being optimised; its value is overwritten.
        cost: symbolic cost expression reported by the compiled function.
        updates: list of ``(shared_variable, new_expression)`` pairs applied
            on every call of the compiled function.
        init_x: array-like starting point assigned to ``x``.

    Returns:
        A compiled Theano function taking no inputs and returning
        ``[x, cost]``.
    """
    # Convert explicitly to the variable's dtype, mirroring how ``x`` is
    # constructed.  Theano may refuse to silently downcast a plain Python
    # list (e.g. when floatX is float32), so do not pass the raw list.
    x.set_value(np.asarray(init_x, dtype=x.dtype))
    return theano.function(
        inputs=[],
        outputs=[x, cost],
        updates=updates,
    )

def simulate(f, n_epoch_max=100):
    """Call ``f`` repeatedly and record the first value of each result.

    Parameters:
        f: zero-argument callable returning a pair ``(x_val, cost_val)``.
        n_epoch_max: number of times ``f`` is called.

    Returns:
        ``(xs, used_epochs)`` where ``xs`` is the list of ``x_val`` values in
        call order and ``used_epochs`` is the number of calls made.

    No convergence check is performed: the loop always runs until
    ``n_epoch_max`` calls have been made.
    """
    xs = []
    # Separator line printed once per run; written so that it is valid on
    # both Python 2 and Python 3.
    print("##################")
    used_epochs = 0
    while used_epochs < n_epoch_max:
        x_val, _cost_val = f()  # the cost is returned by f but not recorded
        xs.append(x_val)
        used_epochs += 1
    return xs, used_epochs


###############
# ADADELTA    #
###############
# The parenthesised single-argument form prints identically on Python 2 and 3.
print("Using AdaDelta with rho = %f and epsilon = %f" % (rho, epsilon))

# Running average of squared gradients, one zero-initialised accumulator per
# parameter.
egs = [
    theano.shared(
        value=np.zeros(shape, dtype=theano.config.floatX),
        borrow=True,
        name="Eg:" + p.name,
    )
    for shape, p in zip(param_shapes, params)
]

# Running average of squared parameter updates, one per parameter.
exs = [
    theano.shared(
        value=np.zeros(shape, dtype=theano.config.floatX),
        borrow=True,
        name="Ex:" + p.name,
    )
    for shape, p in zip(param_shapes, params)
]

# Eg_t = rho * Eg_{t-1} + (1 - rho) * g^2
new_egs = [
    rho * eg + (1 - rho) * g ** 2
    for eg, g in zip(egs, param_grads)
]

# dx_t = -sqrt(Ex_{t-1} + eps) / sqrt(Eg_t + eps) * g
# Note the asymmetry: the *previous* update average but the *new* gradient
# average are used.
delta_x = [
    -(T.sqrt(ex + epsilon) / T.sqrt(new_eg + epsilon)) * g
    for new_eg, ex, g in zip(new_egs, exs, param_grads)
]

# Ex_t = rho * Ex_{t-1} + (1 - rho) * dx_t^2
new_exs = [
    rho * ex + (1 - rho) * (dx ** 2)
    for ex, dx in zip(exs, delta_x)
]

# list(...) keeps the concatenation below working where zip() returns an
# iterator instead of a list.
egs_updates = list(zip(egs, new_egs))
exs_updates = list(zip(exs, new_exs))
param_updates = [
    (p, p + dx)
    for p, dx in zip(params, delta_x)
]

updates = egs_updates + exs_updates + param_updates

f = make_func(x, cost, updates, init_x)
adadelta_xs, adadelta_epochs = simulate(f)

##############
# ADAGRAD    #
##############
# The parenthesised single-argument form prints identically on Python 2 and 3.
print("Using AdaGrad with gamma = %f and epsilon = %f" % (gamma, epsilon))

# Accumulated sum of squared gradients, one zero-initialised accumulator per
# parameter.
grad_hists = [
    theano.shared(
        value=np.zeros(param_shape, dtype=theano.config.floatX),
        borrow=True,
        name="grad_hist:" + param.name,
    )
    for param_shape, param in zip(param_shapes, params)
]

# G_t = G_{t-1} + g^2
new_grad_hists = [
    g_hist + g ** 2
    for g_hist, g in zip(grad_hists, param_grads)
]

# x_t = x_{t-1} - gamma / (sqrt(G_t) + eps) * g
# The history is bound explicitly through zip() rather than picked up from a
# loop variable of another comprehension, and the *updated* history G_t is
# used so the very first step (where G_{t-1} is zero) is not divided by
# epsilon alone.  Print("lr") echoes the per-coordinate step size each time
# the compiled function runs.
param_updates = [
    (
        param,
        param
        - theano.printing.Print("lr")(gamma / (T.sqrt(new_g_hist) + epsilon))
        * param_grad,
    )
    for param, param_grad, new_g_hist in zip(params, param_grads, new_grad_hists)
]

# list(...) keeps the concatenation below working where zip() returns an
# iterator instead of a list.
grad_hist_update = list(zip(grad_hists, new_grad_hists))

updates = grad_hist_update + param_updates

f = make_func(x, cost, updates, init_x)
adagrad_xs, adagrad_epochs = simulate(f)

###############
# constant lr #
###############
# The parenthesised single-argument form prints identically on Python 2 and 3.
print("Using constant learning rate %f" % (const_lr,))

# Plain gradient descent: x_t = x_{t-1} - const_lr * g
updates = [
    (param, param - const_lr * param_grad)
    for param, param_grad in zip(params, param_grads)
]

f = make_func(x, cost, updates, init_x)
const_lr_xs, const_lr_epochs = simulate(f)

from matplotlib import pyplot  as plt

def myplot(data, style, title, plot_number, total):
    """Plot one 2-D trajectory in a subplot of a single-row figure.

    Parameters:
        data: sequence of 2-element points; the first components are drawn on
            the horizontal axis and the second on the vertical axis.
        style: matplotlib format string for the line (e.g. ``'ro-'``).
        title: title shown above the subplot.
        plot_number: 1-based index of the subplot within the row.
        total: number of subplots in the row.
    """
    plt.subplot(1, total, plot_number)
    # Transpose the list of points into one sequence per axis.
    xs, ys = zip(*data)
    # Honour the caller-supplied style instead of a hard-coded one.
    plt.plot(xs, ys, style)
    plt.title(title)
    # Fixed axis limits so all subplots share the same scale.
    plt.xlim([-10, 10])
    plt.ylim([-10, 10])

# One entry per optimiser: (trajectory, title template, number of epochs).
# The order fixes the left-to-right position of the subplots.
runs = [
    (adadelta_xs, "AdaDelta(%d epochs)", adadelta_epochs),
    (adagrad_xs, "AdaGrad(%d epochs)", adagrad_epochs),
    (const_lr_xs, "ConstLR(%d epochs)", const_lr_epochs),
]

for plot_idx, (trajectory, title_fmt, n_epochs) in enumerate(runs, 1):
    myplot(trajectory, 'ro-', title_fmt % (n_epochs), plot_idx, len(runs))

plt.show()