【Hackathon 6th No.26】API improvement for nn.initializer.KaimingNormal and nn.initializer.KaimingUniform (usability improvement) -part #63268

Closed · wants to merge 1 commit
58 changes: 45 additions & 13 deletions python/paddle/nn/initializer/kaiming.py
@@ -41,21 +41,22 @@ class MSRAInitializer(Initializer):

.. math::

x = gain \times \sqrt{\frac{3}{fan\_in}}
x = gain \times \sqrt{\frac{3}{fan\_mode}}

In case of Normal distribution, the mean is 0 and the standard deviation
is

.. math::

\frac{gain}{\sqrt{{fan\_in}}}
\frac{gain}{\sqrt{{fan\_mode}}}

Args:
uniform (bool, optional): whether to use uniform or normal distribution. Default is True.
fan_in (float32|None, optional): fan_in (in_features) of the trainable Tensor. If None, it will be inferred automatically. If you do not want to use the in_features of the Tensor, you can set 'fan_in' to a value of your own choosing. Default is None.
seed (int32, optional): random seed. Default is 0.
negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0.
nonlinearity (str, optional): the non-linear function. Default is relu.
mode (str, optional): either 'fan_in' or 'fan_out', indicating which fan is used to compute the scale. 'fan_in' preserves the magnitude of the variance of the weights in the forward pass; 'fan_out' preserves it in the backward pass. Default is 'fan_in'.

Note:
It is recommended to set fan_in to None for most cases.
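
As a reading aid (illustrative only, not part of the diff), a minimal sketch of how the new mode argument changes the scale, assuming the relu gain of sqrt(2) and a hypothetical 2-D weight of shape [5, 10], for which Paddle treats shape[0] as fan_in and shape[1] as fan_out:

import math

fan_in, fan_out = 5, 10  # from a hypothetical [5, 10] weight
gain = math.sqrt(2.0)    # calculate_gain('relu')

# KaimingUniform samples from U(-limit, limit) with limit = gain * sqrt(3 / fan)
limit_fan_in = gain * math.sqrt(3.0 / fan_in)    # mode='fan_in'  -> ~1.095
limit_fan_out = gain * math.sqrt(3.0 / fan_out)  # mode='fan_out' -> ~0.775

# KaimingNormal samples from N(0, std**2) with std = gain / sqrt(fan)
std_fan_in = gain / math.sqrt(fan_in)    # mode='fan_in'  -> ~0.632
std_fan_out = gain / math.sqrt(fan_out)  # mode='fan_out' -> ~0.447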
@@ -69,6 +70,7 @@ def __init__(
seed=0,
negative_slope=0,
nonlinearity='relu',
mode='fan_in',
):
"""Constructor for MSRAInitializer"""
assert uniform is not None
@@ -79,6 +81,7 @@ def __init__(
self._seed = seed
self._negative_slope = negative_slope
self._nonlinearity = nonlinearity
self._mode = mode

def forward(self, var, block=None):
"""Initialize the input tensor with MSRA initialization.
@@ -102,7 +105,10 @@ def forward(self, var, block=None):
f_in, f_out = self._compute_fans(var)

# If fan_in is passed, use it; otherwise choose f_in or f_out according to mode
fan_in = f_in if self._fan_in is None else self._fan_in
if self._fan_in is None:
fan = f_in if self._mode == 'fan_in' else f_out
else:
fan = self._fan_in

if self._seed == 0:
self._seed = block.program.random_seed
@@ -134,7 +140,7 @@ def forward(self, var, block=None):
if in_dygraph_mode():
if self._uniform:
gain = calculate_gain(self._nonlinearity, self._negative_slope)
limit = gain * math.sqrt(3.0 / float(fan_in))
limit = gain * math.sqrt(3.0 / float(fan))
out_var = _C_ops.uniform(
var.shape,
out_dtype,
Expand All @@ -145,7 +151,7 @@ def forward(self, var, block=None):
)
else:
gain = calculate_gain(self._nonlinearity, self._negative_slope)
std = gain / math.sqrt(float(fan_in))
std = gain / math.sqrt(float(fan))
place = _current_expected_place()
out_var = _C_ops.gaussian(
out_var.shape, 0.0, std, self._seed, out_dtype, place
@@ -162,7 +168,7 @@ def forward(self, var, block=None):
elif in_pir_mode():
if self._uniform:
gain = calculate_gain(self._nonlinearity, self._negative_slope)
limit = gain * math.sqrt(3.0 / float(fan_in))
limit = gain * math.sqrt(3.0 / float(fan))
out_var = _C_ops.uniform(
var.shape,
out_dtype,
@@ -173,7 +179,7 @@ def forward(self, var, block=None):
)
else:
gain = calculate_gain(self._nonlinearity, self._negative_slope)
std = gain / math.sqrt(float(fan_in))
std = gain / math.sqrt(float(fan))
place = _current_expected_place()
out_var = _C_ops.gaussian(
out_var.shape, 0.0, std, self._seed, out_dtype, place
@@ -189,7 +195,7 @@ def forward(self, var, block=None):
else:
if self._uniform:
gain = calculate_gain(self._nonlinearity, self._negative_slope)
limit = gain * math.sqrt(3.0 / float(fan_in))
limit = gain * math.sqrt(3.0 / float(fan))
op = block.append_op(
type="uniform_random",
inputs={},
@@ -206,7 +212,7 @@

else:
gain = calculate_gain(self._nonlinearity, self._negative_slope)
std = gain / math.sqrt(float(fan_in))
std = gain / math.sqrt(float(fan))
op = block.append_op(
type="gaussian_random",
outputs={"Out": out_var},
@@ -249,12 +255,13 @@ class KaimingNormal(MSRAInitializer):

.. math::

\frac{gain}{\sqrt{{fan\_in}}}
\frac{gain}{\sqrt{{fan\_mode}}}

Args:
fan_in (float32|None, optional): fan_in (in_features) of the trainable Tensor. If None, it will be inferred automatically. If you do not want to use the in_features of the Tensor, you can set 'fan_in' to a value of your own choosing. Default is None.
negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0.
nonlinearity (str, optional): the non-linear function. Default is relu.
mode (str, optional): either 'fan_in' or 'fan_out', indicating which fan is used to compute the scale. 'fan_in' preserves the magnitude of the variance of the weights in the forward pass; 'fan_out' preserves it in the backward pass. Default is 'fan_in'.

Note:
It is recommended to set fan_in to None for most cases.
@@ -271,13 +278,25 @@ class KaimingNormal(MSRAInitializer):

"""

def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
def __init__(
self,
fan_in=None,
negative_slope=0.0,
nonlinearity='relu',
mode='fan_in',
):
if mode not in ('fan_in', 'fan_out'):
    raise ValueError(
        "Kaiming initializer's mode only supports 'fan_in' or 'fan_out'."
    )

super().__init__(
uniform=False,
fan_in=fan_in,
seed=0,
negative_slope=negative_slope,
nonlinearity=nonlinearity,
mode=mode,
)
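
A minimal usage sketch of the new argument (illustrative only, not part of the diff; the layer sizes are arbitrary):

import paddle

# Hypothetical example: draw the Linear weight from a Kaiming normal
# distribution whose std is computed from fan_out instead of the default fan_in.
linear = paddle.nn.Linear(
    10,
    20,
    weight_attr=paddle.ParamAttr(
        initializer=paddle.nn.initializer.KaimingNormal(mode='fan_out')
    ),
)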


@@ -295,10 +314,11 @@ class KaimingUniform(MSRAInitializer):

.. math::

x = gain \times \sqrt{\frac{3}{fan\_in}}
x = gain \times \sqrt{\frac{3}{fan\_mode}}

Args:
fan_in (float32|None, optional): fan_in (in_features) of the trainable Tensor. If None, it will be inferred automatically. If you do not want to use the in_features of the Tensor, you can set 'fan_in' to a value of your own choosing. Default is None.
mode (str, optional): either 'fan_in' or 'fan_out', indicating which fan is used to compute the scale. 'fan_in' preserves the magnitude of the variance of the weights in the forward pass; 'fan_out' preserves it in the backward pass. Default is 'fan_in'.
negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0.
nonlinearity (str, optional): the non-linear function. Default is relu.

@@ -317,11 +337,23 @@ class KaimingUniform(MSRAInitializer):

"""

def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
def __init__(
self,
fan_in=None,
negative_slope=0.0,
nonlinearity='relu',
mode='fan_in',
):
if mode not in ('fan_in', 'fan_out'):
    raise ValueError(
        "Kaiming initializer's mode only supports 'fan_in' or 'fan_out'."
    )

super().__init__(
uniform=True,
fan_in=fan_in,
seed=0,
negative_slope=negative_slope,
nonlinearity=nonlinearity,
mode=mode,
)
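
And the uniform counterpart (again an illustrative sketch, not part of the diff): with mode='fan_out' the bound is computed from shape[1] of a 2-D parameter rather than shape[0].

import paddle

# Hypothetical example: create a standalone parameter initialized with
# Kaiming uniform statistics computed from fan_out (shape[1] == 10 here).
w = paddle.create_parameter(
    shape=[5, 10],
    dtype='float32',
    default_initializer=paddle.nn.initializer.KaimingUniform(mode='fan_out'),
)
print(float(w.abs().max()))  # expected to stay below sqrt(6 / 10) ~ 0.775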
118 changes: 118 additions & 0 deletions test/deprecated/legacy_test/test_initializer.py
@@ -942,6 +942,52 @@ def test_msra_initializer_bf16(self):
"""Test the MSRA initializer with bfloat16"""
block = self.test_msra_initializer_supplied_arguments("uint16")

def test_uniform_msra_initializer_fan_mode(self):
"""Test MSRA initializer with uniform distribution and
'fan_out' mode.
"""
program = framework.Program()
block = program.global_block()
for _ in range(2):
param = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=paddle.nn.initializer.KaimingUniform(
mode='fan_out'
),
)
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'uniform_random')
limit = np.sqrt(6.0 / param.shape[1])
self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
self.assertEqual(init_op.attr('seed'), 0)

def test_normal_msra_initializer_fan_mode(self):
"""Test MSRA initializer with normal distribution and
'fan_out' mode.
"""
program = framework.Program()
block = program.global_block()
for _ in range(2):
param = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="param",
initializer=paddle.nn.initializer.KaimingNormal(mode='fan_out'),
)
self.assertEqual(len(block.ops), 1)
init_op = block.ops[0]
self.assertEqual(init_op.type, 'gaussian_random')
std = np.sqrt(2.0 / param.shape[1])
self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
self.assertEqual(init_op.attr('seed'), 0)
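
For context (editorial note, not part of the diff): the constants asserted above follow from the formulas in kaiming.py, since the default nonlinearity is relu (gain = sqrt(2)) and fan_out for the [5, 10] parameter is shape[1] = 10:

import numpy as np

gain = np.sqrt(2.0)  # calculate_gain('relu')
fan_out = 10         # shape[1] of the [5, 10] test parameter
print(gain * np.sqrt(3.0 / fan_out))  # uniform limit == np.sqrt(6.0 / 10)
print(gain / np.sqrt(fan_out))        # normal std   == np.sqrt(2.0 / 10)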


class TestMSRAInitializerPir(unittest.TestCase):
def setUp(self):
@@ -1171,6 +1217,68 @@ def test_msra_initializer_bf16(self):
exe.run(startup_2)
exe.run(main_2)

def test_uniform_msra_initializer_fan_mode(self):
"""Test MSRA initializer with uniform distribution and
'fan_out' mode.
"""
with paddle.pir_utils.IrGuard():
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
param = paddle.pir.core.create_parameter(
dtype="float32",
shape=[5, 10],
name="param",
initializer=paddle.nn.initializer.KaimingUniform(
mode='fan_out'
),
)
block = startup.global_block()
checked_ops = self.get_init_ops_by_op_name(
block, self.init_uniform_op_name
)
self.assertEqual(len(checked_ops), 1)
init_op = checked_ops[0]
limit = np.sqrt(6.0 / param.shape[1])
min = self.get_operand_definition_op_attrs(
init_op, "min", "value"
)
max = self.get_operand_definition_op_attrs(
init_op, "max", "value"
)
self.assertAlmostEqual(min, -limit, delta=DELTA)
self.assertAlmostEqual(max, limit, delta=DELTA)
self.assertEqual(init_op.attrs()['seed'], 0)

def test_normal_msra_initializer_fan_mode(self):
"""Test MSRA initializer with normal distribution and
'fan_out' mode.
"""
with paddle.pir_utils.IrGuard():
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
param = paddle.pir.core.create_parameter(
dtype="float32",
shape=[5, 10],
name="param",
initializer=paddle.nn.initializer.KaimingNormal(
mode='fan_out'
),
)
block = startup.global_block()
checked_ops = self.get_init_ops_by_op_name(
block, self.init_normal_op_name
)
self.assertEqual(len(checked_ops), 1)
init_op = checked_ops[0]
std = np.sqrt(2.0 / param.shape[1])
self.assertAlmostEqual(
init_op.attrs()['mean'], 0.0, delta=DELTA
)
self.assertAlmostEqual(init_op.attrs()['std'], std, delta=DELTA)
self.assertEqual(init_op.attrs()['seed'], 0)


class TestBilinearInitializer(unittest.TestCase):
def test_bilinear_initializer(self, dtype="float32"):
@@ -2207,6 +2315,16 @@ def test_type_error(self):
ZeroDivisionError, self.func_kaiminguniform_initializer_fan_in_zero
)

def test_input_error(self):
with self.assertRaises(ValueError):
paddle.nn.initializer.KaimingUniform(mode='in')


class TestKaimingNormal(unittest.TestCase):
def test_input_error(self):
with self.assertRaises(ValueError):
paddle.nn.initializer.KaimingNormal(mode='in')


class TestTruncatedNormalInitializerDygraph(unittest.TestCase):
def _trunc_normal_numpy(self, tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):