From 38f7c5584016e92ba1e0ee1b00ea6632740f67ce Mon Sep 17 00:00:00 2001
From: mingchuan
Date: Sat, 13 May 2017 16:53:24 -0500
Subject: [PATCH] Fix RMSProp update rule (#6235)

* Fix RMSProp update rule

Follow the formula presented in Alex's paper; this prevents taking the
square root of a negative value (caused by arithmetic error).

* Fix the formula of the non-centered version of RMSProp

* Fix the RMSProp update rule in the Python test

* Fix the RMSProp update rule in the Perl test
---
 perl-package/AI-MXNet/t/test_optimizers.t |  4 +--
 src/operator/optimizer_op-inl.h           | 38 ++++++++++++-----------
 tests/python/unittest/test_optimizer.py   |  4 +--
 3 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/perl-package/AI-MXNet/t/test_optimizers.t b/perl-package/AI-MXNet/t/test_optimizers.t
index e0027b901cf8..c808e844a88d 100644
--- a/perl-package/AI-MXNet/t/test_optimizers.t
+++ b/perl-package/AI-MXNet/t/test_optimizers.t
@@ -166,7 +166,7 @@ method update($index, $weight, $grad, $state)
             $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient);
         }
         $n .= (1 - $self->gamma1) * ($grad * $grad) + $self->gamma1 * $n;
-        $weight -= $lr * $grad/(mx->nd->sqrt($n) + $self->epsilon);
+        $weight -= $lr * $grad/(mx->nd->sqrt($n + $self->epsilon));
     }
     else
     {
@@ -177,7 +177,7 @@ method update($index, $weight, $grad, $state)
         }
         $n .= (1 - $self->gamma1) * ($grad * $grad) + $self->gamma1 * $n;
         $g .= (1 - $self->gamma1) * $grad + $self->gamma1 * $g;
-        $delta .= ($self->gamma2) * $delta - $lr * $grad/(mx->nd->sqrt($n - $g*$g) + $self->epsilon);
+        $delta .= ($self->gamma2) * $delta - $lr * $grad/(mx->nd->sqrt($n - $g*$g + $self->epsilon));
         $weight += $delta;
     }
     if($self->clip_weights)
diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h
index be49d1319e3c..96f480bf8bc7 100755
--- a/src/operator/optimizer_op-inl.h
+++ b/src/operator/optimizer_op-inl.h
@@ -300,8 +300,8 @@ inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs,
       delta = scalar<DType>(param.gamma2) * delta -
               scalar<DType>(param.lr) *
                   (F<clip>(grad, DType(param.clip_gradient)) /
-                   (F<square_root>(state_n - state_g * state_g) +
-                    scalar<DType>(param.epsilon)));
+                   (F<square_root>(state_n - state_g * state_g +
+                                   scalar<DType>(param.epsilon))));
     } else {
       state_n = scalar<DType>(1.f - param.gamma1) * (grad * grad) +
                 scalar<DType>(param.gamma1) * state_n;
@@ -309,8 +309,8 @@ inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs,
                 scalar<DType>(param.gamma1) * state_g;
       delta = scalar<DType>(param.gamma2) * delta -
               scalar<DType>(param.lr) *
-                  (grad / (F<square_root>(state_n - state_g * state_g) +
-                           scalar<DType>(param.epsilon)));
+                  (grad / (F<square_root>(state_n - state_g * state_g +
+                                          scalar<DType>(param.epsilon))));
     }
 
     if (param.clip_weights >= 0.0f) {
@@ -386,17 +386,17 @@ inline void RMSPropUpdate(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
     if (param.clip_weights >= 0.0f) {
       Assign(out, req[0],
              F<clip>(weight -
-                     scalar<DType>(param.lr) *
-                         (F<clip>(grad, DType(param.clip_gradient)) /
-                          (F<square_root>(state_n) +
-                           scalar<DType>(param.epsilon))),
+                         scalar<DType>(param.lr) *
+                             (F<clip>(grad, DType(param.clip_gradient)) /
+                              (F<square_root>(state_n +
+                                              scalar<DType>(param.epsilon)))),
                      DType(param.clip_weights)));
     } else {
       Assign(out, req[0], weight -
-                          scalar<DType>(param.lr) *
-                              (F<clip>(grad, DType(param.clip_gradient)) /
-                               (F<square_root>(state_n) +
-                                scalar<DType>(param.epsilon))));
+                              scalar<DType>(param.lr) *
+                                  (F<clip>(grad, DType(param.clip_gradient)) /
+                                   (F<square_root>(state_n +
+                                                   scalar<DType>(param.epsilon)))));
     }
   } else {
     state_n = scalar<DType>(1.f - param.gamma1) * (grad * grad) +
@@ -404,15 +404,17 @@ inline void RMSPropUpdate(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
     if (param.clip_weights >= 0.0f) {
       Assign(out, req[0],
              F<clip>(weight -
-                     scalar<DType>(param.lr) *
-                         (grad / (F<square_root>(state_n) +
-                                  scalar<DType>(param.epsilon))),
+                         scalar<DType>(param.lr) *
+                             (grad /
+                              (F<square_root>(state_n +
+                                              scalar<DType>(param.epsilon)))),
                      DType(param.clip_weights)));
     } else {
       Assign(out, req[0], weight -
-                          scalar<DType>(param.lr) *
-                              (grad / (F<square_root>(state_n) +
-                                       scalar<DType>(param.epsilon))));
+                              scalar<DType>(param.lr) *
+                                  (grad /
+                                   (F<square_root>(state_n +
+                                                   scalar<DType>(param.epsilon)))));
     }
   }
 });
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 554b6452a950..11ca7bed1743 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -301,7 +301,7 @@ def update(self, index, weight, grad, state):
             if self.clip_gradient is not None:
                 grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
             n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n
-            weight[:] -= lr * grad/(mx.nd.sqrt(n) + self.epsilon)
+            weight[:] -= lr * grad/(mx.nd.sqrt(n + self.epsilon))
 
         else:
             n, g, delta = state
@@ -309,7 +309,7 @@ def update(self, index, weight, grad, state):
                 grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
             n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n
             g[:] = (1 - self.gamma1) * grad + self.gamma1 * g
-            delta[:] = (self.gamma2) * delta - lr * grad/(mx.nd.sqrt(n - g*g) + self.epsilon)
+            delta[:] = (self.gamma2) * delta - lr * grad/(mx.nd.sqrt(n - g*g + self.epsilon))
             weight[:] += delta
 
             if self.clip_weights:
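
A minimal NumPy sketch of the failure mode this patch fixes; the constants and
variable names below are illustrative, not taken from MXNet. In the centered
update, n tracks E[g^2] and g tracks E[g], so n - g*g is mathematically
non-negative, but floating-point rounding can leave it slightly negative:
sqrt(n - g*g) + epsilon then yields NaN, while sqrt(n - g*g + epsilon) stays
finite.

import numpy as np

eps = np.float32(1e-8)
lr = np.float32(0.01)
grad = np.float32(0.1)

# With a constant gradient the running averages converge to n = grad^2 and
# g = grad, so n - g*g should be exactly zero, but float32 rounding leaves
# a tiny negative residue (about -7e-10 here).
n = np.float32(0.01)   # E[g^2] after convergence (illustrative value)
g = np.float32(0.1)    # E[g]   after convergence (illustrative value)
print("n - g*g =", n - g * g)

with np.errstate(invalid="ignore"):
    old_step = lr * grad / (np.sqrt(n - g * g) + eps)  # old rule: sqrt of a negative -> nan
new_step = lr * grad / np.sqrt(n - g * g + eps)        # patched rule: epsilon inside the sqrt

print("old update:", old_step)   # nan
print("new update:", new_step)   # finite

The non-centered branch has the same issue in principle only through epsilon
placement; moving epsilon inside the square root keeps both branches consistent
with the formula in the paper the commit message refers to.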