Commit
add lamb optimizer and unittest (#28772)
* add lamb optimizer and unittest
* fix lamb
* fix lamb v2 op
* fix sampling id
* fix lamb sample code
* Update lamb.py
* fix doc
* fix doc
* Update lamb.py
Showing 4 changed files with 234 additions and 38 deletions.
@@ -0,0 +1,53 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
from paddle.fluid import core
from paddle.fluid.op import Operator
import paddle.fluid as fluid
import paddle


class TestLambOpV2(unittest.TestCase):
    def test_lamb_op(self):
        paddle.enable_static()
        place = fluid.CPUPlace()
        shape = [2, 3, 8, 8]
        exe = fluid.Executor(place)
        train_prog = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(train_prog, startup):
            with fluid.unique_name.guard():
                data = fluid.data(name="data", shape=shape)
                conv = fluid.layers.conv2d(data, 8, 3)
                loss = fluid.layers.reduce_mean(conv)
                beta1 = 0.85
                beta2 = 0.95
                betas = [beta1, beta2]
                opt = paddle.optimizer.Lamb(
                    learning_rate=1e-5, beta1=beta1, beta2=beta2, epsilon=1e-8)
                opt.minimize(loss)

        exe.run(startup)
        data_np = np.random.random(shape).astype('float32')
        rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss])
        assert rets[0] is not None


if __name__ == "__main__":
    unittest.main()
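The test above only exercises the static-graph path. A companion dynamic-graph check could look like the sketch below; it is not part of this commit, and the test name TestLambOpDygraph, the loop length, and the finiteness assertion are illustrative assumptions. The optimizer implementation itself follows in the second file.

import unittest
import numpy as np
import paddle


class TestLambOpDygraph(unittest.TestCase):
    def test_lamb_dygraph(self):
        # Run a few LAMB steps on a toy linear model in imperative mode.
        paddle.disable_static()
        linear = paddle.nn.Linear(10, 10)
        opt = paddle.optimizer.Lamb(
            learning_rate=0.002,
            lamb_weight_decay=0.01,
            parameters=linear.parameters())
        for _ in range(5):
            x = paddle.uniform([4, 10], dtype='float32', min=-0.1, max=0.1)
            loss = paddle.mean(linear(x))
            loss.backward()
            opt.step()
            opt.clear_grad()
        # The loop should run without error and keep the loss finite.
        self.assertTrue(np.isfinite(loss.numpy()).all())


if __name__ == "__main__":
    unittest.main()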
@@ -0,0 +1,177 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .optimizer import Optimizer
from ..fluid import core
from ..fluid import framework
from ..fluid.framework import Variable

__all__ = ["Lamb"]


class Lamb(Optimizer):
""" | ||
LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer. | ||
LAMB Optimizer is designed to scale up the batch size of training without losing | ||
accuracy, which supports adaptive element-wise updating and accurate layer-wise | ||
correction. For more information, please refer to `Large Batch Optimization for | ||
Deep Learning: Training BERT in 76 minutes <https://arxiv.org/abs/1904.00962>`_ . | ||
The updating of parameters follows: | ||
.. math:: | ||
m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t | ||
v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2)g_t^2 | ||
r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon} | ||
w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1}) | ||
where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the | ||
learning rate, :math:`\\lambda` the LAMB weight decay rate. | ||
Args: | ||
learning_rate (float|Variable, optional): the learning rate used to update parameters. \ | ||
Can be a float value or a Variable with data type float32. Default 0.001. | ||
lamb_weight_decay (float, optional): The LAMB weight decay rate. Default 0.01. Remind that weight_decay should be None. | ||
beta1 (float, optional): The exponential decay rate for the 1st moment estimates. | ||
Default 0.9. | ||
beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. | ||
Default 0.999. | ||
epsilon (float, optional): A small float value for numerical stability. Default 1e-6. | ||
parameters (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ | ||
This parameter is required in dygraph mode. \ | ||
The default value is None in static mode, at this time all parameters will be updated. | ||
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of | ||
some derived class of ``GradientClipBase`` . There are three cliping strategies | ||
( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , | ||
:ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. | ||
name(str|None): For detailed information, please refer to | ||
:ref:`api_guide_Name` . Usually name is no need to set and None by default. | ||
Examples: | ||
.. code-block:: python | ||
import paddle | ||
import numpy as np | ||
inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32') | ||
linear = paddle.nn.Linear(10, 10) | ||
out = linear(inp) | ||
loss = paddle.mean(out) | ||
beta1 = paddle.to_tensor([0.9], dtype="float32") | ||
beta2 = paddle.to_tensor([0.85], dtype="float32") | ||
lamb = paddle.optimizer.Lamb(learning_rate=0.002, parameters=linear.parameters(), lamb_weight_decay=0.01) | ||
back = out.backward() | ||
lamb.step() | ||
lamb.clear_grad() | ||
""" | ||
    _moment1_acc_str = "moment1"
    _moment2_acc_str = "moment2"
    # these two are not used by the op for now
    _beta1_pow_acc_str = "beta1_pow_acc"
    _beta2_pow_acc_str = "beta2_pow_acc"

    def __init__(self,
                 learning_rate=0.001,
                 lamb_weight_decay=0.01,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-6,
                 parameters=None,
                 grad_clip=None,
                 name=None):
        assert learning_rate is not None
        assert beta1 is not None
        assert beta2 is not None
        assert epsilon is not None
        super(Lamb, self).__init__(
            learning_rate=learning_rate,
            parameters=parameters,
            weight_decay=None,
            grad_clip=grad_clip,
            name=name)
        self.type = "lamb"
        self._beta1 = beta1
        self._beta2 = beta2
        self._epsilon = epsilon
        self._lamb_weight_decay = lamb_weight_decay

    def _create_accumulators(self, block, parameters):
        assert isinstance(block, framework.Block)

        # Create accumulator tensors for first and second moments
        for p in parameters:
            self._add_accumulator(self._moment1_acc_str, p)
            self._add_accumulator(self._moment2_acc_str, p)
            self._add_accumulator(
                name=self._beta1_pow_acc_str,
                param=p,
                fill_value=0.9 if isinstance(self._beta1, Variable) \
                    else self._beta1,
                shape=[1],
                type=core.VarDesc.VarType.LOD_TENSOR,
                device='cpu')
            self._add_accumulator(
                name=self._beta2_pow_acc_str,
                param=p,
                fill_value=0.999 if isinstance(self._beta2, Variable) \
                    else self._beta2,
                shape=[1],
                type=core.VarDesc.VarType.LOD_TENSOR,
                device='cpu')

    def _append_optimize_op(self, block, param_and_grad):
        assert isinstance(block, framework.Block)
        block.program._use_lamb = True

        moment1 = self._get_accumulator(self._moment1_acc_str,
                                        param_and_grad[0])
        moment2 = self._get_accumulator(self._moment2_acc_str,
                                        param_and_grad[0])
        beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                              param_and_grad[0])
        beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
                                              param_and_grad[0])
        # This class exposes no per-parameter exclusion hook, so the LAMB
        # weight decay is applied uniformly to every parameter.
        weight_decay = self._lamb_weight_decay

        # create the lamb optimize op
        lamb_op = block.append_op(
            type=self.type,
            inputs={
                "Param": param_and_grad[0],
                "Grad": param_and_grad[1],
                "LearningRate": self._create_param_lr(param_and_grad),
                "Moment1": moment1,
                "Moment2": moment2,
                "Beta1Pow": beta1_pow_acc,
                "Beta2Pow": beta2_pow_acc
            },
            outputs={
                "ParamOut": param_and_grad[0],
                "Moment1Out": moment1,
                "Moment2Out": moment2
            },
            attrs={
                "beta1": self._beta1,
                "beta2": self._beta2,
                "epsilon": self._epsilon,
                "weight_decay": weight_decay
            },
            stop_gradient=True)

        return lamb_op
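For reference, the per-parameter update the `lamb` op is expected to perform can be written out in NumPy following the formulas in the class docstring above. This is only a sketch: the helper name lamb_step_ref, the absence of bias correction, and the fallback trust ratio of 1.0 when a norm is zero are assumptions for illustration, not behavior guaranteed by the op.

import numpy as np

def lamb_step_ref(param, grad, m, v, lr, beta1=0.9, beta2=0.999,
                  epsilon=1e-6, weight_decay=0.01):
    # Hypothetical NumPy reference for one LAMB update step.
    # Moment estimates, matching the docstring formulas (no bias correction).
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    r = m / (np.sqrt(v) + epsilon)
    update = r + weight_decay * param
    # Layer-wise trust ratio: ||w_{t-1}|| / ||r_t + lambda * w_{t-1}||.
    w_norm = np.linalg.norm(param)
    u_norm = np.linalg.norm(update)
    trust_ratio = w_norm / u_norm if w_norm > 0 and u_norm > 0 else 1.0
    return param - lr * trust_ratio * update, m, v

# Example: one step on random data, starting from zero moments.
rng = np.random.RandomState(0)
w = rng.rand(10).astype('float32')
g = rng.rand(10).astype('float32')
new_w, m, v = lamb_step_ref(w, g, np.zeros_like(w), np.zeros_like(w), lr=1e-3)

A reference like this could be compared elementwise against the output of the static-graph test above to strengthen the unit test beyond the current not-None check.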