Fix the FP16 precision problem of add_n. (PaddlePaddle#50129)

pangengzheng · Feb 2, 2023 · a94aa2b · a94aa2b
1 parent 211b059
commit a94aa2b
Show file tree

Hide file tree

Showing 2 changed files with 75 additions and 9 deletions.
diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu
@@ -14,11 +14,10 @@
 
 #include "paddle/phi/kernels/add_n_kernel.h"
 
-#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
-
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/memory/memcpy.h"
-
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
 namespace phi {
 
 #define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
@@ -38,16 +37,18 @@ __global__ void Sum2CUDAKernel(const T *in_0,
 template <class T>
 __global__ void SumArrayCUDAKernel(
     T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
   int id = blockIdx.x * blockDim.x + threadIdx.x;
   while (id < N) {
-    T total(read_dst ? out[id] : static_cast<T>(0));
+    MPType total(read_dst ? static_cast<MPType>(out[id])
+                          : static_cast<MPType>(0));
     for (int i = 0; i < in_size; ++i) {
       const T *tmp = in[i];
       if (tmp) {
-        total += tmp[id];
+        total += static_cast<MPType>(tmp[id]);
       }
     }
-    out[id] = total;
+    out[id] = static_cast<T>(total);
     id += blockDim.x * gridDim.x;
   }
 }
@@ -116,11 +117,12 @@ void AddNKernel(const Context &dev_ctx,
     int64_t length_0 = in_0.numel();
     int64_t length_1 = in_1.numel();
     if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
+      using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
       auto result = EigenVector<T>::Flatten(*out);
       auto &place = *dev_ctx.eigen_device();
-      auto in_0_e = EigenVector<T>::Flatten(in_0);
-      auto in_1_e = EigenVector<T>::Flatten(in_1);
-      result.device(place) = in_0_e + in_1_e;
+      auto in_0_e = EigenVector<T>::Flatten(in_0).template cast<MPType>();
+      auto in_1_e = EigenVector<T>::Flatten(in_1).template cast<MPType>();
+      result.device(place) = (in_0_e + in_1_e).template cast<T>();
     } else if (length_0 && in_0.IsInitialized()) {
       auto result = EigenVector<T>::Flatten(*out);
       auto &place = *dev_ctx.eigen_device();

diff --git a/python/paddle/fluid/tests/unittests/test_add_n_op.py b/python/paddle/fluid/tests/unittests/test_add_n_op.py
@@ -0,0 +1,64 @@
+#  Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+
+import paddle
+
+
+class TestAddnOp(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(20)
+        l = 32
+        self.x_np = np.random.random([l, 16, 256])
+
+    def check_main(self, x_np, dtype, axis=None):
+        paddle.disable_static()
+        x = []
+        for i in range(x_np.shape[0]):
+            val = paddle.to_tensor(x_np[i].astype(dtype))
+            val.stop_gradient = False
+            x.append(val)
+        y = paddle.add_n(x)
+        x_g = paddle.grad(y, x)
+        y_np = y.numpy().astype('float32')
+        x_g_np = []
+        for val in x_g:
+            x_g_np.append(val.numpy().astype('float32'))
+        paddle.enable_static()
+        return y_np, x_g_np
+
+    def test_add_n_fp16(self):
+        if not paddle.is_compiled_with_cuda():
+            return
+        y_np_16, x_g_np_16 = self.check_main(self.x_np, 'float16')
+        y_np_32, x_g_np_32 = self.check_main(self.x_np, 'float32')
+
+        np.testing.assert_allclose(y_np_16, y_np_32, rtol=1e-03)
+        for i in range(len(x_g_np_32)):
+            np.testing.assert_allclose(x_g_np_16[i], x_g_np_32[i], rtol=1e-03)
+
+    def test_add_n_api(self):
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        y_np_32, x_g_np_32 = self.check_main(self.x_np, 'float32')
+        y_np_gt = np.sum(self.x_np, axis=0).astype('float32')
+
+        np.testing.assert_allclose(y_np_32, y_np_gt, rtol=1e-06)
+
+
+if __name__ == "__main__":
+    unittest.main()