From 9c96c40833baa337f5107d5a35db2b295530d5c1 Mon Sep 17 00:00:00 2001
From: Rahul
Date: Tue, 7 Aug 2018 00:35:31 -0700
Subject: [PATCH 1/3] Add warmup and fix inconsistencies with learning rate schedulers

---
 example/image-classification/common/fit.py |   3 +-
 python/mxnet/lr_scheduler.py               | 105 +++++++++++++++++----
 tests/python/unittest/test_optimizer.py    |  49 ++++++++++
 3 files changed, 140 insertions(+), 17 deletions(-)

diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py
index 3f37ad3ac591..dc983ea6f8d4 100755
--- a/example/image-classification/common/fit.py
+++ b/example/image-classification/common/fit.py
@@ -49,7 +49,8 @@ def _get_lr_scheduler(args, kv):
     steps = [epoch_size * (x - begin_epoch) for x in step_epochs
              if x - begin_epoch > 0]
     if steps:
-        return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor))
+        return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor,
+                                                         base_lr=args.lr))
     else:
         return (lr, None)

diff --git a/python/mxnet/lr_scheduler.py b/python/mxnet/lr_scheduler.py
index 963560d17853..51e0bfffbbcc 100644
--- a/python/mxnet/lr_scheduler.py
+++ b/python/mxnet/lr_scheduler.py
@@ -17,6 +17,7 @@

 """Scheduling learning rate."""
 import logging
+from math import cos, pi

 class LRScheduler(object):
     """Base class of a learning rate scheduler.
@@ -29,8 +30,31 @@ class LRScheduler(object):
     base_lr : float, optional
         The initial learning rate.
     """
-    def __init__(self, base_lr=0.01):
+    def __init__(self, base_lr=0.01, warmup_steps=0, warmup_begin_lr=0, warmup_mode='linear'):
         self.base_lr = base_lr
+        assert isinstance(warmup_steps, int)
+        self.warmup_steps = warmup_steps
+
+        self.warmup_final_lr = base_lr
+        self.warmup_begin_lr = warmup_begin_lr
+        if self.warmup_begin_lr > self.warmup_final_lr:
+            raise ValueError("Base lr has to be higher than warmup_begin_lr")
+        if self.warmup_steps < 0:
+            raise ValueError("Warmup steps has to be positive or 0")
+        if warmup_mode not in ['linear', 'constant']:
+            raise ValueError("Supports only linear and constant modes of warmup")
+        self.warmup_mode = warmup_mode
+
+    def get_warmup_lr(self, num_update):
+        assert num_update < self.warmup_steps
+        if self.warmup_mode == 'linear':
+            increase = (self.warmup_final_lr - self.warmup_begin_lr) \
+                       * float(num_update)/float(self.warmup_steps)
+            return self.warmup_begin_lr + increase
+        elif self.warmup_mode == 'constant':
+            return self.warmup_begin_lr
+        else:
+            raise ValueError("Invalid warmup mode %s"%self.warmup_mode)

     def __call__(self, num_update):
         """Return a new learning rate.
@@ -66,8 +90,9 @@ class FactorScheduler(LRScheduler):
     stop_factor_lr : float, optional
         Stop updating the learning rate if it is less than this value.
     """
-    def __init__(self, step, factor=1, stop_factor_lr=1e-8):
-        super(FactorScheduler, self).__init__()
+    def __init__(self, step, factor=1, stop_factor_lr=1e-8, base_lr=0.01,
+                 warmup_steps=0, warmup_begin_lr=0, warmup_mode='linear'):
+        super(FactorScheduler, self).__init__(base_lr, warmup_steps, warmup_begin_lr, warmup_mode)
         if step < 1:
             raise ValueError("Schedule step must be greater or equal than 1 round")
         if factor > 1.0:
@@ -78,6 +103,9 @@ def __init__(self, step, factor=1, stop_factor_lr=1e-8):
         self.count = 0

     def __call__(self, num_update):
+        if num_update < self.warmup_steps:
+            return self.get_warmup_lr(num_update)
+
         # NOTE: use while rather than if (for continuing training via load_epoch)
         while num_update > self.count + self.step:
             self.count += self.step
@@ -109,8 +137,10 @@ class MultiFactorScheduler(LRScheduler):
     factor: float
         The factor to change the learning rate.
     """
-    def __init__(self, step, factor=1):
-        super(MultiFactorScheduler, self).__init__()
+    def __init__(self, step, factor=1, base_lr=0.01, warmup_steps=0, warmup_begin_lr=0,
+                 warmup_mode='linear'):
+        super(MultiFactorScheduler, self).__init__(base_lr, warmup_steps,
+                                                   warmup_begin_lr, warmup_mode)
         assert isinstance(step, list) and len(step) >= 1
         for i, _step in enumerate(step):
             if i != 0 and step[i] <= step[i-1]:
@@ -125,6 +155,9 @@ def __init__(self, step, factor=1):
         self.count = 0

     def __call__(self, num_update):
+        if num_update < self.warmup_steps:
+            return self.get_warmup_lr(num_update)
+
         # NOTE: use while rather than if (for continuing training via load_epoch)
         while self.cur_step_ind <= len(self.step)-1:
             if num_update > self.step[self.cur_step_ind]:
@@ -138,33 +171,73 @@ def __call__(self, num_update):
         return self.base_lr

 class PolyScheduler(LRScheduler):
+    """ Reduce the learning rate according to a polynomial of given power.
+
+    Calculate the new learning rate by::
+
+       final_lr + (start_lr - final_lr) * (1-nup/max_nup)^pwr
+       if nup < max_nup, 0 otherwise.
+
+    Parameters
+    ----------
+       max_update: maximum number of updates before the decay reaches final learning rate.
+       base_lr: base learning rate to start from
+       pwr: power of the decay term as a function of the current number of updates.
+       final_lr: final learning rate after all steps
+       warmup_steps: number of warmup steps used before this scheduler starts decay
+    """
+
+    def __init__(self, max_update, base_lr=0.01, pwr=2, final_lr=0,
+                 warmup_steps=0, warmup_begin_lr=0, warmup_mode='linear'):
+        super(PolyScheduler, self).__init__(base_lr, warmup_steps, warmup_begin_lr, warmup_mode)
+        assert isinstance(max_update, int)
+        if max_update < 1:
+            raise ValueError("maximum number of updates must be strictly positive")
+        self.power = pwr
+        self.base_lr_orig = self.base_lr
+        self.max_update = max_update
+        self.final_lr = final_lr
+        self.max_steps = self.max_update - self.warmup_steps
+
+    def __call__(self, num_update):
+        if num_update < self.warmup_steps:
+            return self.get_warmup_lr(num_update)
+        if num_update <= self.max_update:
+            self.base_lr = self.final_lr + (self.base_lr_orig - self.final_lr) * \
+                pow(1 - float(num_update - self.warmup_steps) / float(self.max_steps), self.power)
+        return self.base_lr
+
+class CosineScheduler(LRScheduler):
     """ Reduce the learning rate by given a list of steps.

     Calculate the new learning rate by::

-       base_lr * (1-nup/max_nup)^pwr
+       final_lr + (start_lr - final_lr) * (1+cos(pi * nup/max_nup))/2
        if nup < max_nup, 0 otherwise.

     Parameters
     ----------
-       max_update: maximum number of updates before the decay reaches 0.
+       max_update: maximum number of updates before the decay reaches 0
        base_lr: base learning rate
-       pwr: power of the decay term as a funtion of the current number of updates.
-
+       final_lr: final learning rate after all steps
+       warmup_steps: number of warmup steps used before this scheduler starts decay
     """
-    def __init__(self, max_update, base_lr=0.01, pwr=2):
-        super(PolyScheduler, self).__init__(base_lr)
+    def __init__(self, max_update, base_lr=0.01, final_lr=0,
+                 warmup_steps=0, warmup_begin_lr=0, warmup_mode='linear'):
+        super(CosineScheduler, self).__init__(base_lr, warmup_steps, warmup_begin_lr, warmup_mode)
         assert isinstance(max_update, int)
         if max_update < 1:
             raise ValueError("maximum number of updates must be strictly positive")
-        self.base_lr_orig = self.base_lr
+        self.base_lr_orig = base_lr
         self.max_update = max_update
-        self.power = pwr
-        self.base_lr = self.base_lr_orig
+        self.final_lr = final_lr
+        self.max_steps = self.max_update - self.warmup_steps

     def __call__(self, num_update):
+        if num_update < self.warmup_steps:
+            return self.get_warmup_lr(num_update)
         if num_update <= self.max_update:
-            self.base_lr = self.base_lr_orig * pow(1.0 - float(num_update) / float(self.max_update),
-                                                   self.power)
+            self.base_lr = self.final_lr + (self.base_lr_orig - self.final_lr) * \
+                (1 + cos(pi * (num_update - self.warmup_steps) / self.max_steps)) / 2
         return self.base_lr
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 90762f7620ff..b0658de6b690 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -1033,6 +1033,55 @@ def test_adagrad():
                           w_stype='row_sparse', g_stype='row_sparse')


+def test_factor_scheduler():
+    base_lr = 1
+    step = 100
+    factor = 0.1
+    sched = mx.lr_scheduler.FactorScheduler(step, factor, stop_factor_lr=1e-4, base_lr=base_lr,
+                                            warmup_steps=20, warmup_begin_lr=0.1, warmup_mode='constant')
+    assert (sched(0) == 0.1)
+    np.testing.assert_almost_equal(sched(10), 0.1)
+    assert (sched(21) == base_lr), sched(21)
+    np.testing.assert_almost_equal(sched(101), base_lr * factor)
+    np.testing.assert_almost_equal(sched(201), base_lr * factor * factor)
+    np.testing.assert_almost_equal(sched(1000), 1e-4)
+
+def test_multifactor_scheduler():
+    base_lr = 0.1
+    steps = [15, 25]
+    factor = 0.1
+    sched = mx.lr_scheduler.MultiFactorScheduler(steps, factor, base_lr=base_lr,
+                                                 warmup_steps=10, warmup_begin_lr=0.05, warmup_mode='linear')
+    assert sched(0) == 0.05
+    np.testing.assert_almost_equal(sched(5), 0.05 + (base_lr - 0.05)/2)
+    np.testing.assert_almost_equal(sched(15), base_lr)
+    np.testing.assert_almost_equal(sched(16), base_lr * factor)
+    np.testing.assert_almost_equal(sched(20), base_lr * factor)
+    np.testing.assert_almost_equal(sched(26), base_lr * factor * factor)
+    np.testing.assert_almost_equal(sched(100), base_lr * factor * factor)
+
+def test_poly_scheduler():
+    base_lr = 3
+    final_lr = 0
+    steps = 1000
+    poly_sched = mx.lr_scheduler.PolyScheduler(steps, base_lr=base_lr, pwr=2, final_lr=final_lr,
+                                               warmup_steps=100, warmup_begin_lr=0, warmup_mode='linear')
+    np.testing.assert_almost_equal(poly_sched(0), 0)
+    np.testing.assert_almost_equal(poly_sched(50), float(base_lr)/2)
+    np.testing.assert_almost_equal(poly_sched(100), base_lr)
+    assert (poly_sched(101) < poly_sched(100))
+    assert (poly_sched(500) < 1.6)
+    np.testing.assert_almost_equal(poly_sched(steps), final_lr)
+
+def test_cosine_scheduler():
+    # also tests case without warmup
+    base_lr = 3
+    final_lr = 0.1
+    steps = 1000
+    cosine_sched = mx.lr_scheduler.CosineScheduler(steps, base_lr=base_lr, final_lr=final_lr)
+    np.testing.assert_almost_equal(cosine_sched(0), base_lr)
+    np.testing.assert_almost_equal(cosine_sched(steps), final_lr)
+    assert (cosine_sched(500) > 1.5)

 if __name__ == '__main__':
     import nose

From 44a5af4277ddc3a1213ced06ac9439606a1e1032 Mon Sep 17 00:00:00 2001
From: Rahul
Date: Fri, 17 Aug 2018 15:02:07 -0700
Subject: [PATCH 2/3] add comments

---
 python/mxnet/lr_scheduler.py            | 64 ++++++++++++++++++-----
 tests/python/unittest/test_optimizer.py |  6 ++-
 2 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/python/mxnet/lr_scheduler.py b/python/mxnet/lr_scheduler.py
index 51e0bfffbbcc..436085620a2e 100644
--- a/python/mxnet/lr_scheduler.py
+++ b/python/mxnet/lr_scheduler.py
@@ -29,8 +29,17 @@ class LRScheduler(object):
     ----------
     base_lr : float, optional
         The initial learning rate.
+    warmup_steps: int
+        number of warmup steps used before this scheduler starts decay
+    warmup_begin_lr: float
+        if using warmup, the learning rate from which it starts warming up
+    warmup_mode: string
+        warmup can be done in two modes.
+        'linear' mode gradually increases lr with each step in equal increments
+        'constant' mode keeps lr at warmup_begin_lr for warmup_steps
     """
-    def __init__(self, base_lr=0.01, warmup_steps=0, warmup_begin_lr=0, warmup_mode='linear'):
+    def __init__(self, base_lr=0.01,
+                 warmup_steps=0, warmup_begin_lr=0, warmup_mode='linear'):
         self.base_lr = base_lr
         assert isinstance(warmup_steps, int)
         self.warmup_steps = warmup_steps
@@ -49,7 +58,7 @@ def get_warmup_lr(self, num_update):
         assert num_update < self.warmup_steps
         if self.warmup_mode == 'linear':
             increase = (self.warmup_final_lr - self.warmup_begin_lr) \
-                       * float(num_update)/float(self.warmup_steps)
+                       * float(num_update) / float(self.warmup_steps)
             return self.warmup_begin_lr + increase
         elif self.warmup_mode == 'constant':
             return self.warmup_begin_lr
@@ -136,6 +145,14 @@ class MultiFactorScheduler(LRScheduler):
         The list of steps to schedule a change
     factor: float
         The factor to change the learning rate.
+    warmup_steps: int
+        number of warmup steps used before this scheduler starts decay
+    warmup_begin_lr: float
+        if using warmup, the learning rate from which it starts warming up
+    warmup_mode: string
+        warmup can be done in two modes.
+        'linear' mode gradually increases lr with each step in equal increments
+        'constant' mode keeps lr at warmup_begin_lr for warmup_steps
     """
     def __init__(self, step, factor=1, base_lr=0.01, warmup_steps=0, warmup_begin_lr=0,
                  warmup_mode='linear'):
@@ -173,18 +190,29 @@ def __call__(self, num_update):
 class PolyScheduler(LRScheduler):
     """ Reduce the learning rate according to a polynomial of given power.

-    Calculate the new learning rate by::
+    Calculate the new learning rate, after warmup if any, by::

        final_lr + (start_lr - final_lr) * (1-nup/max_nup)^pwr
        if nup < max_nup, 0 otherwise.

     Parameters
     ----------
-       max_update: maximum number of updates before the decay reaches final learning rate.
-       base_lr: base learning rate to start from
-       pwr: power of the decay term as a function of the current number of updates.
-       final_lr: final learning rate after all steps
-       warmup_steps: number of warmup steps used before this scheduler starts decay
+    max_update: int
+        maximum number of updates before the decay reaches final learning rate.
+    base_lr: float
+        base learning rate to start from
+    pwr: int
+        power of the decay term as a function of the current number of updates.
+    final_lr: float
+        final learning rate after all steps
+    warmup_steps: int
+        number of warmup steps used before this scheduler starts decay
+    warmup_begin_lr: float
+        if using warmup, the learning rate from which it starts warming up
+    warmup_mode: string
+        warmup can be done in two modes.
+        'linear' mode gradually increases lr with each step in equal increments
+        'constant' mode keeps lr at warmup_begin_lr for warmup_steps
     """

     def __init__(self, max_update, base_lr=0.01, pwr=2, final_lr=0,
@@ -208,7 +236,7 @@ def __call__(self, num_update):
         return self.base_lr

 class CosineScheduler(LRScheduler):
-    """ Reduce the learning rate by given a list of steps.
+    """ Reduce the learning rate according to a cosine function

     Calculate the new learning rate by::

@@ -217,10 +245,20 @@ class CosineScheduler(LRScheduler):

     Parameters
     ----------
-       max_update: maximum number of updates before the decay reaches 0
-       base_lr: base learning rate
-       final_lr: final learning rate after all steps
-       warmup_steps: number of warmup steps used before this scheduler starts decay
+    max_update: int
+        maximum number of updates before the decay reaches 0
+    base_lr: float
+        base learning rate
+    final_lr: float
+        final learning rate after all steps
+    warmup_steps: int
+        number of warmup steps used before this scheduler starts decay
+    warmup_begin_lr: float
+        if using warmup, the learning rate from which it starts warming up
+    warmup_mode: string
+        warmup can be done in two modes.
+        'linear' mode gradually increases lr with each step in equal increments
+        'constant' mode keeps lr at warmup_begin_lr for warmup_steps
     """

     def __init__(self, max_update, base_lr=0.01, final_lr=0,
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index b0658de6b690..06271923eaed 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -1035,10 +1035,11 @@ def test_adagrad():

 def test_factor_scheduler():
     base_lr = 1
-    step = 100
+    step = 50
     factor = 0.1
     sched = mx.lr_scheduler.FactorScheduler(step, factor, stop_factor_lr=1e-4, base_lr=base_lr,
                                             warmup_steps=20, warmup_begin_lr=0.1, warmup_mode='constant')
+
     assert (sched(0) == 0.1)
     np.testing.assert_almost_equal(sched(10), 0.1)
     assert (sched(21) == base_lr), sched(21)
@@ -1052,6 +1053,7 @@ def test_multifactor_scheduler():
     factor = 0.1
     sched = mx.lr_scheduler.MultiFactorScheduler(steps, factor, base_lr=base_lr,
                                                  warmup_steps=10, warmup_begin_lr=0.05, warmup_mode='linear')
+
     assert sched(0) == 0.05
     np.testing.assert_almost_equal(sched(5), 0.05 + (base_lr - 0.05)/2)
     np.testing.assert_almost_equal(sched(15), base_lr)
@@ -1066,6 +1068,7 @@ def test_poly_scheduler():
     steps = 1000
     poly_sched = mx.lr_scheduler.PolyScheduler(steps, base_lr=base_lr, pwr=2, final_lr=final_lr,
                                                warmup_steps=100, warmup_begin_lr=0, warmup_mode='linear')
+
     np.testing.assert_almost_equal(poly_sched(0), 0)
     np.testing.assert_almost_equal(poly_sched(50), float(base_lr)/2)
     np.testing.assert_almost_equal(poly_sched(100), base_lr)
@@ -1082,6 +1085,7 @@ def test_cosine_scheduler():
     np.testing.assert_almost_equal(cosine_sched(0), base_lr)
     np.testing.assert_almost_equal(cosine_sched(steps), final_lr)
     assert (cosine_sched(500) > 1.5)
+    assert False

 if __name__ == '__main__':
     import nose

From 9ca2dd908c1a4b808bb7989b9a6fc99367b6cf07 Mon Sep 17 00:00:00 2001
From: Rahul
Date: Fri, 17 Aug 2018 15:11:43 -0700
Subject: [PATCH 3/3] remove assert

---
 tests/python/unittest/test_optimizer.py | 3 +-
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 06271923eaed..7b22647b21a4 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -1035,7 +1035,7 @@ def test_adagrad():

 def test_factor_scheduler():
     base_lr = 1
-    step = 50
+    step = 100
     factor = 0.1
     sched = mx.lr_scheduler.FactorScheduler(step, factor, stop_factor_lr=1e-4, base_lr=base_lr,
                                             warmup_steps=20, warmup_begin_lr=0.1, warmup_mode='constant')
@@ -1085,7 +1085,6 @@ def test_cosine_scheduler():
     np.testing.assert_almost_equal(cosine_sched(0), base_lr)
     np.testing.assert_almost_equal(cosine_sched(steps), final_lr)
     assert (cosine_sched(500) > 1.5)
-    assert False

 if __name__ == '__main__':
     import nose
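
Reviewer note (not part of the patch series): the sketch below shows how the schedulers added here would be driven once the series is applied, mirroring the expectations encoded in the new unit tests. It assumes an MXNet build that contains these patches; the hyperparameters and update counts are illustrative only.

    import mxnet as mx

    # PolyScheduler from this series: linear warmup over the first 100 updates
    # (warmup_begin_lr=0 up to base_lr), then squared polynomial decay from
    # base_lr down to final_lr at update 1000, as exercised by test_poly_scheduler.
    sched = mx.lr_scheduler.PolyScheduler(1000, base_lr=3, pwr=2, final_lr=0,
                                          warmup_steps=100, warmup_begin_lr=0,
                                          warmup_mode='linear')

    # A scheduler is a callable taking the update count: sched(50) is base_lr/2
    # while warming up, sched(100) is base_lr, and sched(1000) reaches final_lr.
    for num_update in (0, 50, 100, 500, 1000):
        print(num_update, sched(num_update))

In training code the scheduler would normally be handed to an optimizer (for example through the optimizer's lr_scheduler argument) rather than called directly.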