Fix roll kernel gpu bug. #52012

Merged · 4 commits · Mar 24, 2023
paddle/phi/kernels/gpu/roll_kernel_impl.h (1 addition, 1 deletion)

@@ -39,7 +39,7 @@ __global__ void RollCudaKernel(const T* input,
 
 #pragma unroll
   for (size_t i = 0; i < Rank; i++) {
-    new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i];
+    new_dim_idx = (output_idx / strides[i]) % sizes[i] + shifts[i];
     if (new_dim_idx >= sizes[i]) {
       output_idx += (shifts[i] - sizes[i]) * strides[i];
     } else {
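The changed line is the whole bug: the kernel accumulates each axis shift into output_idx one pass at a time, but computed the per-axis coordinate from the original linear index idx. When the same dimension is named twice in axis (the new tests use axis=[-1, 1] on an (11, 11) tensor, and -1 and 1 are the same axis of a 2-D tensor), the second pass must see the coordinate already moved by the first pass; reading it from the stale idx makes the wrap-around branch misfire and can push output_idx out of range. A minimal NumPy sketch of the sequential-shift semantics the fix restores (roll_reference is an illustrative helper, not Paddle API):

import numpy as np

def roll_reference(x, shifts, axes):
    # Apply each (shift, axis) pair one at a time, the way the CUDA
    # kernel walks its shifts array: each pass must observe the effect
    # of the previous ones, which only matters when an axis repeats.
    out = x
    for shift, axis in zip(shifts, axes):
        out = np.roll(out, shift, axis=axis)
    return out

x = np.arange(121, dtype=np.float32).reshape(11, 11)
# axis -1 and axis 1 name the same dimension of a 2-D array, so
# shifting by [1, 1] must equal a single shift by 2 on that axis.
assert np.array_equal(roll_reference(x, [1, 1], [-1, 1]),
                      np.roll(x, 2, axis=1))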
python/paddle/fluid/tests/unittests/test_roll_op.py (36 additions, 0 deletions)

@@ -61,6 +61,14 @@ def init_dtype_type(self):
         self.axis = [-1, -2]
+
+
+class TestRollOpCase3(TestRollOp):
+    def init_dtype_type(self):
+        self.dtype = np.float32
+        self.x_shape = (11, 11)
+        self.shifts = [1, 1]
+        self.axis = [-1, 1]
 
 
 class TestRollFP16OP(TestRollOp):
     def init_dtype_type(self):
         self.dtype = np.float16
@@ -77,6 +85,14 @@ def init_dtype_type(self):
         self.axis = [-1, -2]
+
+
+class TestRollFP16OpCase3(TestRollOp):
+    def init_dtype_type(self):
+        self.dtype = np.float16
+        self.x_shape = (11, 11)
+        self.shifts = [1, 1]
+        self.axis = [-1, 1]
 
 
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
@@ -117,6 +133,26 @@ def test_check_grad_normal(self):
         self.check_grad_with_place(self.place, ['X'], 'Out', check_eager=True)
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or does not support bfloat16",
+)
+class TestRollBF16OpCase3(TestRollOp):
+    def init_dtype_type(self):
+        self.dtype = np.uint16
+        self.x_shape = (11, 11)
+        self.shifts = [1, 1]
+        self.axis = [-1, 1]
+        self.place = core.CUDAPlace(0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_eager=True)
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(self.place, ['X'], 'Out', check_eager=True)
 
 
 class TestRollAPI(unittest.TestCase):
     def input_data(self):
         self.data_x = np.array(
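For a quick manual check outside the OpTest harness, a sketch along these lines should pass on a CUDA build of Paddle that includes this fix (the expected value follows NumPy's cumulative-shift rule for a repeated axis):

import numpy as np
import paddle

x = paddle.arange(121, dtype='float32').reshape([11, 11])
# -1 and 1 both name the last axis, so the two unit shifts accumulate.
out = paddle.roll(x, shifts=[1, 1], axis=[-1, 1])
np.testing.assert_allclose(out.numpy(), np.roll(x.numpy(), 2, axis=1))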