Commit 18756fa

Unify the gpu implementation of stack and unstack to reuse the optimization. (PaddlePaddle#49748)

* Unify the gpu implementation of stack and unstack to reuse the optimization.

* Optimize the cuda implementation of unstack.

* Use GpuMemcpyAsync instead of memory::Copy.

* Fix an error in calculating the index.

* Use FastDivMod to further improve the performance of unstack (a minimal sketch of the idea is included after this list).
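
The FastDivMod mentioned in the last bullet (used below through the GeneralDivMod wrapper) replaces per-element integer division and modulo with a precomputed multiply-and-shift, which is much cheaper than hardware division on the GPU. Below is a minimal, host-only sketch of that idea; the struct name and test harness are illustrative rather than the exact Paddle implementation, and it assumes divisors in [1, 2^31) and dividends below 2^31.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Sketch of division by a runtime-invariant divisor via a precomputed magic
// multiplier and shift (the idea behind FastDivMod). On the GPU the high
// 32-bit multiply would be __umulhi(n, multiplier).
struct FastDivModSketch {
  explicit FastDivModSketch(uint32_t d) : divisor(d) {
    for (shift = 0; shift < 32; ++shift) {
      if ((1u << shift) >= d) break;
    }
    uint64_t one = 1;
    multiplier =
        static_cast<uint32_t>(((one << 32) * ((one << shift) - d)) / d + 1);
  }

  uint32_t Div(uint32_t n) const {
    uint32_t hi =
        static_cast<uint32_t>((static_cast<uint64_t>(n) * multiplier) >> 32);
    return (hi + n) >> shift;
  }

  void DivMod(uint32_t n, uint32_t* q, uint32_t* r) const {
    *q = Div(n);
    *r = n - *q * divisor;  // multiply-subtract instead of a second division
  }

  uint32_t divisor;
  uint32_t shift;
  uint32_t multiplier;
};

int main() {
  // Spot-check the sketch against the hardware / and % operators.
  for (uint32_t d : {1u, 3u, 7u, 15u, 60u, 1000u}) {
    FastDivModSketch fdm(d);
    for (uint32_t n = 0; n < 100000; ++n) {
      uint32_t q, r;
      fdm.DivMod(n, &q, &r);
      assert(q == n / d && r == n % d);
    }
  }
  std::printf("FastDivMod sketch matches / and %% for the tested values\n");
  return 0;
}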
Xreki authored and pangengzheng committed Feb 2, 2023
1 parent 772e3f4 commit 18756fa
Showing 8 changed files with 338 additions and 234 deletions.
13 changes: 6 additions & 7 deletions paddle/phi/kernels/funcs/segmented_array.h
@@ -14,7 +14,7 @@

#pragma once

-#include "paddle/phi/kernels/funcs/fast_divmod.h"
+#include "paddle/phi/core/dense_tensor.h"

namespace phi {
namespace funcs {
@@ -89,12 +89,11 @@ struct ArraySetterBase {
        ctx.GetPlace(),
        num_bytes,
        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    paddle::memory::Copy(ctx.GetPlace(),
-                         allocation->ptr(),
-                         phi::CPUPlace(),
-                         src,
-                         num_bytes,
-                         ctx.stream());
+    phi::backends::gpu::GpuMemcpyAsync(allocation->ptr(),
+                                       src,
+                                       num_bytes,
+                                       phi::gpuMemcpyHostToDevice,
+                                       ctx.stream());
    return allocation->ptr();
  }
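
The hunk above replaces paddle::memory::Copy with the phi::backends::gpu::GpuMemcpyAsync wrapper when staging the host-side pointer array on the device. A self-contained CUDA sketch of that pattern using the raw cudaMemcpyAsync runtime call (the buffer sizes and names are illustrative; the real code allocates through phi's allocator on the same stream):

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

int main() {
  constexpr int kNumTensors = 4;
  constexpr int kNumel = 8;

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Pretend these are the per-tensor device data pointers gathered on the host.
  std::vector<float*> host_ptrs(kNumTensors, nullptr);
  for (int i = 0; i < kNumTensors; ++i) {
    cudaMalloc(&host_ptrs[i], kNumel * sizeof(float));
  }

  // Allocate device memory for the pointer array and copy it asynchronously,
  // ordered with later kernel launches on the same stream.
  float** dev_ptr_array = nullptr;
  size_t num_bytes = kNumTensors * sizeof(float*);
  cudaMalloc(&dev_ptr_array, num_bytes);
  cudaMemcpyAsync(dev_ptr_array, host_ptrs.data(), num_bytes,
                  cudaMemcpyHostToDevice, stream);

  // Any kernel launched next on the same stream that reads dev_ptr_array is
  // ordered after this copy; here we simply synchronize and clean up.
  cudaStreamSynchronize(stream);

  for (int i = 0; i < kNumTensors; ++i) cudaFree(host_ptrs[i]);
  cudaFree(dev_ptr_array);
  cudaStreamDestroy(stream);
  std::printf("pointer array staged on the device\n");
  return 0;
}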

276 changes: 276 additions & 0 deletions paddle/phi/kernels/funcs/stack_and_unstack.h
@@ -0,0 +1,276 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/fast_divmod.h"
#include "paddle/phi/kernels/funcs/segmented_array.h"

namespace phi {
namespace funcs {

template <typename T, typename IndexT, typename ArrayT>
__global__ void StackCudaKernel(ArrayT array,
GeneralDivMod<IndexT> divmoder,
IndexT split_size,
IndexT rows,
IndexT cols,
T* __restrict__ output) {
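  // Each thread walks the output matrix [rows, cols] in a 2D grid-stride
  // pattern: grid_x selects an output column (decoded below into an input
  // tensor index and a column within it), grid_y selects an output row.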
IndexT grid_x = static_cast<IndexT>(blockIdx.x) * blockDim.x + threadIdx.x;
IndexT grid_x_stride = static_cast<IndexT>(blockDim.x) * gridDim.x;
IndexT grid_y_stride = static_cast<IndexT>(blockDim.y) * gridDim.y;

for (; grid_x < cols; grid_x += grid_x_stride) {
IndexT grid_y = static_cast<IndexT>(blockIdx.y) * blockDim.y + threadIdx.y;

auto divmod_rslt = divmoder.div_mod(grid_x);
IndexT split = divmod_rslt[0]; // grid_x / split_size
IndexT col_offset = divmod_rslt[1]; // grid_x % split_size
const T* input_ptr = array.data[split];
#pragma unroll
for (; grid_y < rows; grid_y += grid_y_stride) {
output[grid_y * cols + grid_x] =
input_ptr[grid_y * split_size + col_offset];
}
}
}

template <typename Context,
typename T,
typename IndexT,
SegmentedArraySize Size>
void LaunchStackKernel(const Context& ctx,
const IndexT x_col,
const IndexT x_row,
const IndexT out_col,
const std::vector<const DenseTensor*>& x,
DenseTensor* out) {
T* out_ptr = ctx.template Alloc<T>(out);
auto config = phi::backends::gpu::GetGpuLaunchConfig2D(ctx, out_col, x_row);

ConstPointerArraySetter<Context, T, Size> setter(ctx, x);
GeneralDivMod<IndexT> divmoder(x_col);
StackCudaKernel<T, IndexT, decltype(setter.array)>
<<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
setter.array, divmoder, x_col, x_row, out_col, out_ptr);
}

template <typename T, typename Context>
void StackRawKernel(const Context& ctx,
const std::vector<const DenseTensor*>& x,
int axis,
DenseTensor* out) {
if (axis < 0) axis += (x[0]->dims().size() + 1);
int num = static_cast<int>(x.size());

  // Flatten x's dims into a matrix of shape [x_row, x_col], where x_row is the
  // product of the dims before axis; the output is then viewed as [x_row, out_col].
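  // Illustrative example (not part of the original source): stacking num = 4
  // tensors of shape [2, 3, 5] at axis = 1 gives x_row = 2, x_col = 15 and
  // out_col = 60, i.e. the [2, 4, 3, 5] output is written as a [2, 60] matrix.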
int64_t x_row = 1;
for (int i = 0; i < axis; ++i) {
x_row *= x[0]->dims()[i];
}
int64_t x_col = x[0]->numel() / x_row;
int64_t out_col = x_col * num;

if (out->numel() < std::numeric_limits<int32_t>::max()) {
switch (CalcArraySize(num)) {
SEGMENTED_ARRAY_KERNEL_HELPER(
LaunchStackKernel<Context, T, int32_t, kArraySize>(
ctx, x_col, x_row, out_col, x, out));
}
} else {
switch (CalcArraySize(num)) {
SEGMENTED_ARRAY_KERNEL_HELPER(
LaunchStackKernel<Context, T, int64_t, kArraySize>(
ctx, x_col, x_row, out_col, x, out));
}
}
}

template <typename T, typename IndexT, typename ArrayT>
__global__ void UnStackCudaKernel(const T* __restrict__ input,
IndexT out_row,
IndexT split_dim,
IndexT out_col,
IndexT num_splits,
GeneralDivMod<IndexT> col_divmoder,
ArrayT array) {
assert(blockDim.y == 1);
assert(blockDim.z == 1);
  // As called from UnStackRawKernel below, num_splits == split_dim, so this holds.
assert(split_dim % num_splits == 0);

IndexT numel = out_row * split_dim * out_col;
IndexT each_dim_size = split_dim / num_splits;
IndexT split_dim_with_out_col = split_dim * out_col;

IndexT offset = blockIdx.x * blockDim.x + threadIdx.x;
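  // The input is viewed as [out_row, split_dim, out_col], so a linear offset
  // decomposes as offset = (i * split_dim + j) * out_col + k; the divisions
  // below recover (i, j, k), with the division by out_col done via FastDivMod.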
if (each_dim_size == 1) {
for (; offset < numel; offset += blockDim.x * gridDim.x) {
auto col_divmod_rslt = col_divmoder.div_mod(offset);

IndexT i = offset / split_dim_with_out_col;
IndexT j = col_divmod_rslt[0] - i * split_dim;
IndexT k = col_divmod_rslt[1]; // offset % out_col

T* output = array.data[j];
if (output) {
IndexT output_idx = i * out_col + k;
*(output + output_idx) = input[offset];
}
}
} else {
for (; offset < numel; offset += blockDim.x * gridDim.x) {
auto col_divmod_rslt = col_divmoder.div_mod(offset);

IndexT i = offset / split_dim_with_out_col;
IndexT j = col_divmod_rslt[0] - i * split_dim;
IndexT k = col_divmod_rslt[1]; // offset % out_col

T* output = array.data[j / each_dim_size];
if (output) {
IndexT output_idx = (i + j % each_dim_size) * out_col + k;
*(output + output_idx) = input[offset];
}
}
}
}

template <typename T, typename IndexT, typename ArrayT>
__global__ void UnStackCudaKernelForLastDim(const T* __restrict__ in_data,
const IndexT cols,
const IndexT rows,
const IndexT tile_x_num,
ArrayT array) {
constexpr int buffer_size = 512;
__shared__ T s_buf[buffer_size];
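  // Each thread stages one input element in s_buf and then writes it to the
  // output tensor selected by its column index (one output per slice of the
  // stacked axis). The launch configuration in LaunchUnStackKernel uses at
  // most 512 threads per block, so buffer_size == 512 is sufficient.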

for (IndexT tile_x = blockIdx.x; tile_x < tile_x_num; tile_x += gridDim.x) {
IndexT row_idx = tile_x * blockDim.x + threadIdx.x;
IndexT col_idx = blockIdx.y * blockDim.y + threadIdx.y;
int s_idx = threadIdx.y * blockDim.x + threadIdx.x;
bool is_valid = (col_idx < cols && row_idx < rows);

if (is_valid) {
T data = in_data[row_idx * cols + col_idx];
s_buf[s_idx] = data;
}
__syncthreads();
if (is_valid) {
if (array.data[col_idx]) {
array.data[col_idx][row_idx] = s_buf[s_idx];
}
}
}
}

template <typename Context,
typename T,
typename IndexT,
SegmentedArraySize Size>
void LaunchUnStackKernel(const Context& ctx,
const IndexT out_row,
const IndexT split_dim,
const IndexT out_col,
const IndexT num_splits,
const DenseTensor& x,
std::vector<DenseTensor*>* outs) {
  // Each tensor in outs should have the same shape.
VLOG(6) << "out_row=" << out_row << ", split_dim=" << split_dim
<< ", out_col=" << out_col << ", num_splits=" << num_splits;

auto x_ptr = x.data<T>();
PointerArraySetter<Context, T, Size> setter(ctx, outs);

if (out_col == 1) {
// For the case axis == (x.dims().size() - 1)
constexpr int kThreads = 512;
constexpr int kWarpSize = 32;
constexpr int kMaxOut = 16;

int tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
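  // Heuristic: with few outputs, give each output its own threadIdx.y and
  // spend the remaining threads on rows; otherwise cap threadIdx.y at kMaxOut
  // and tile the outputs across blockIdx.y.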
if (split_dim < kMaxOut) {
tid_y = split_dim;
tid_x =
std::min(backends::gpu::RoundToNextHighPowOfTwo(out_row, kWarpSize),
kThreads / backends::gpu::RoundToNextHighPowOfTwo(tid_y));
} else {
tid_y = kMaxOut;
tid_x = kWarpSize;
bid_y = backends::gpu::DivUp<int>(split_dim, kMaxOut);
}
int tile_x_num = backends::gpu::DivUp<int>(out_row, tid_x);
bid_x = std::min(tile_x_num, backends::gpu::kMultiDimslimit);
dim3 blocks(tid_x, tid_y, 1);
dim3 grids(bid_x, bid_y, 1);

UnStackCudaKernelForLastDim<T, IndexT, decltype(setter.array)>
<<<grids, blocks, 0, ctx.stream()>>>(
x_ptr, split_dim, out_row, tile_x_num, setter.array);
} else {
GeneralDivMod<IndexT> col_divmoder(out_col);
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
ctx, out_row * split_dim * out_col);

UnStackCudaKernel<T, IndexT, decltype(setter.array)>
<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
ctx.stream()>>>(x_ptr,
out_row,
split_dim,
out_col,
num_splits,
col_divmoder,
setter.array);
}
}

template <typename T, typename Context>
void UnStackRawKernel(const Context& ctx,
const DenseTensor& x,
int axis,
std::vector<DenseTensor*>* outs) {
auto x_dims = x.dims();

  // The input tensor is split along the axis dimension into split_dim sub-tensors.
int64_t split_dim = x_dims[axis];

// Treat outs[i] as [out_row, out_col], and x as [out_row, split_dim,
// out_col].
int64_t out_row = 1;
for (int i = 0; i < axis; ++i) {
out_row *= x_dims[i];
}

int64_t out_col = x.numel() / (split_dim * out_row);

if (x.numel() < std::numeric_limits<int32_t>::max()) {
switch (CalcArraySize(split_dim)) {
SEGMENTED_ARRAY_KERNEL_HELPER(
LaunchUnStackKernel<Context, T, int32_t, kArraySize>(
ctx, out_row, split_dim, out_col, split_dim, x, outs));
}
} else {
switch (CalcArraySize(split_dim)) {
SEGMENTED_ARRAY_KERNEL_HELPER(
LaunchUnStackKernel<Context, T, int64_t, kArraySize>(
ctx, out_row, split_dim, out_col, split_dim, x, outs));
}
}
}

} // namespace funcs
} // namespace phi
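
For context, the other files changed in this commit (not shown in this excerpt) rewire the stack/unstack GPU kernels to call the shared helpers above. A minimal sketch of what that wiring plausibly looks like for stack; the public kernel signature here is an assumption following the phi convention, not copied from the diff:

// Sketch only (assumed wiring; see paddle/phi/kernels/gpu/stack_kernel.cu in
// the commit for the actual code, which is not part of this excerpt).
template <typename T, typename Context>
void StackKernel(const Context& ctx,
                 const std::vector<const DenseTensor*>& x,
                 int axis,
                 DenseTensor* out) {
  funcs::StackRawKernel<T, Context>(ctx, x, axis, out);
}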