Commit 18756fa

Unify the gpu implementation of stack and unstack to reuse the optimization. (PaddlePaddle#49748)

* Unify the gpu implementation of stack and unstack to reuse the optimization.

* Optimize the cuda implementation of unstack.

* Use GpuMemcpyAsync instead of memory::Copy.

* Fix an error in calculating the index.

* Use FastDivMod to further improve the performance of unstack (a minimal sketch of the idea is included after this list).
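
The FastDivMod mentioned in the last bullet (used below through the GeneralDivMod wrapper) replaces per-element integer division and modulo with a precomputed multiply-and-shift, which is much cheaper than hardware division on the GPU. Below is a minimal, host-only sketch of that idea; the struct name and test harness are illustrative rather than the exact Paddle implementation, and it assumes divisors in [1, 2^31) and dividends below 2^31.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Sketch of division by a runtime-invariant divisor via a precomputed magic
// multiplier and shift (the idea behind FastDivMod). On the GPU the high
// 32-bit multiply would be __umulhi(n, multiplier).
struct FastDivModSketch {
  explicit FastDivModSketch(uint32_t d) : divisor(d) {
    for (shift = 0; shift < 32; ++shift) {
      if ((1u << shift) >= d) break;
    }
    uint64_t one = 1;
    multiplier =
        static_cast<uint32_t>(((one << 32) * ((one << shift) - d)) / d + 1);
  }

  uint32_t Div(uint32_t n) const {
    uint32_t hi =
        static_cast<uint32_t>((static_cast<uint64_t>(n) * multiplier) >> 32);
    return (hi + n) >> shift;
  }

  void DivMod(uint32_t n, uint32_t* q, uint32_t* r) const {
    *q = Div(n);
    *r = n - *q * divisor;  // multiply-subtract instead of a second division
  }

  uint32_t divisor;
  uint32_t shift;
  uint32_t multiplier;
};

int main() {
  // Spot-check the sketch against the hardware / and % operators.
  for (uint32_t d : {1u, 3u, 7u, 15u, 60u, 1000u}) {
    FastDivModSketch fdm(d);
    for (uint32_t n = 0; n < 100000; ++n) {
      uint32_t q, r;
      fdm.DivMod(n, &q, &r);
      assert(q == n / d && r == n % d);
    }
  }
  std::printf("FastDivMod sketch matches / and %% for the tested values\n");
  return 0;
}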
Xreki authored and pangengzheng committed Feb 2, 2023
1 parent 772e3f4 commit 18756fa
Showing 8 changed files with 338 additions and 234 deletions.
13 changes: 6 additions & 7 deletions paddle/phi/kernels/funcs/segmented_array.h
@@ -14,7 +14,7 @@

#pragma once

-#include "paddle/phi/kernels/funcs/fast_divmod.h"
+#include "paddle/phi/core/dense_tensor.h"

namespace phi {
namespace funcs {
@@ -89,12 +89,11 @@ struct ArraySetterBase {
        ctx.GetPlace(),
        num_bytes,
        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
-    paddle::memory::Copy(ctx.GetPlace(),
-                         allocation->ptr(),
-                         phi::CPUPlace(),
-                         src,
-                         num_bytes,
-                         ctx.stream());
+    phi::backends::gpu::GpuMemcpyAsync(allocation->ptr(),
+                                       src,
+                                       num_bytes,
+                                       phi::gpuMemcpyHostToDevice,
+                                       ctx.stream());
    return allocation->ptr();
  }
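
The hunk above replaces paddle::memory::Copy with the phi::backends::gpu::GpuMemcpyAsync wrapper when staging the host-side pointer array on the device. A self-contained CUDA sketch of that pattern using the raw cudaMemcpyAsync runtime call (the buffer sizes and names are illustrative; the real code allocates through phi's allocator on the same stream):

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

int main() {
  constexpr int kNumTensors = 4;
  constexpr int kNumel = 8;

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Pretend these are the per-tensor device data pointers gathered on the host.
  std::vector<float*> host_ptrs(kNumTensors, nullptr);
  for (int i = 0; i < kNumTensors; ++i) {
    cudaMalloc(&host_ptrs[i], kNumel * sizeof(float));
  }

  // Allocate device memory for the pointer array and copy it asynchronously,
  // ordered with later kernel launches on the same stream.
  float** dev_ptr_array = nullptr;
  size_t num_bytes = kNumTensors * sizeof(float*);
  cudaMalloc(&dev_ptr_array, num_bytes);
  cudaMemcpyAsync(dev_ptr_array, host_ptrs.data(), num_bytes,
                  cudaMemcpyHostToDevice, stream);

  // Any kernel launched next on the same stream that reads dev_ptr_array is
  // ordered after this copy; here we simply synchronize and clean up.
  cudaStreamSynchronize(stream);

  for (int i = 0; i < kNumTensors; ++i) cudaFree(host_ptrs[i]);
  cudaFree(dev_ptr_array);
  cudaStreamDestroy(stream);
  std::printf("pointer array staged on the device\n");
  return 0;
}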

276 changes: 276 additions & 0 deletions paddle/phi/kernels/funcs/stack_and_unstack.h
@@ -0,0 +1,276 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/fast_divmod.h"
#include "paddle/phi/kernels/funcs/segmented_array.h"

namespace phi {
namespace funcs {

template <typename T, typename IndexT, typename ArrayT>
__global__ void StackCudaKernel(ArrayT array,
GeneralDivMod<IndexT> divmoder,
IndexT split_size,
IndexT rows,
IndexT cols,
T* __restrict__ output) {
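  // Each thread walks the output matrix [rows, cols] in a 2D grid-stride
  // pattern: grid_x selects an output column (decoded below into an input
  // tensor index and a column within it), grid_y selects an output row.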
IndexT grid_x = static_cast<IndexT>(blockIdx.x) * blockDim.x + threadIdx.x;
IndexT grid_x_stride = static_cast<IndexT>(blockDim.x) * gridDim.x;
IndexT grid_y_stride = static_cast<IndexT>(blockDim.y) * gridDim.y;

for (; grid_x < cols; grid_x += grid_x_stride) {
IndexT grid_y = static_cast<IndexT>(blockIdx.y) * blockDim.y + threadIdx.y;

auto divmod_rslt = divmoder.div_mod(grid_x);
IndexT split = divmod_rslt[0]; // grid_x / split_size
IndexT col_offset = divmod_rslt[1]; // grid_x % split_size
const T* input_ptr = array.data[split];
#pragma unroll
for (; grid_y < rows; grid_y += grid_y_stride) {
output[grid_y * cols + grid_x] =
input_ptr[grid_y * split_size + col_offset];
}
}
}

template <typename Context,
typename T,
typename IndexT,
SegmentedArraySize Size>
void LaunchStackKernel(const Context& ctx,
const IndexT x_col,
const IndexT x_row,
const IndexT out_col,
const std::vector<const DenseTensor*>& x,
DenseTensor* out) {
T* out_ptr = ctx.template Alloc<T>(out);
auto config = phi::backends::gpu::GetGpuLaunchConfig2D(ctx, out_col, x_row);

ConstPointerArraySetter<Context, T, Size> setter(ctx, x);
GeneralDivMod<IndexT> divmoder(x_col);
StackCudaKernel<T, IndexT, decltype(setter.array)>
<<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
setter.array, divmoder, x_col, x_row, out_col, out_ptr);
}

template <typename T, typename Context>
void StackRawKernel(const Context& ctx,
const std::vector<const DenseTensor*>& x,
int axis,
DenseTensor* out) {
if (axis < 0) axis += (x[0]->dims().size() + 1);
int num = static_cast<int>(x.size());

  // Flatten x's dims into a matrix of shape [x_row, x_col], where x_row is the
  // product of the dims before axis; the output is then viewed as [x_row, out_col].
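  // Illustrative example (not part of the original source): stacking num = 4
  // tensors of shape [2, 3, 5] at axis = 1 gives x_row = 2, x_col = 15 and
  // out_col = 60, i.e. the [2, 4, 3, 5] output is written as a [2, 60] matrix.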
int64_t x_row = 1;
for (int i = 0; i < axis; ++i) {
x_row *= x[0]->dims()[i];
}
int64_t x_col = x[0]->numel() / x_row;
int64_t out_col = x_col * num;

if (out->numel() < std::numeric_limits<int32_t>::max()) {
switch (CalcArraySize(num)) {
SEGMENTED_ARRAY_KERNEL_HELPER(
LaunchStackKernel<Context, T, int32_t, kArraySize>(
ctx, x_col, x_row, out_col, x, out));
}
} else {
switch (CalcArraySize(num)) {
SEGMENTED_ARRAY_KERNEL_HELPER(
LaunchStackKernel<Context, T, int64_t, kArraySize>(
ctx, x_col, x_row, out_col, x, out));
}
}
}

template <typename T, typename IndexT, typename ArrayT>
__global__ void UnStackCudaKernel(const T* __restrict__ input,
IndexT out_row,
IndexT split_dim,
IndexT out_col,
IndexT num_splits,
GeneralDivMod<IndexT> col_divmoder,
ArrayT array) {
assert(blockDim.y == 1);
assert(blockDim.z == 1);
  // As called from UnStackRawKernel below, num_splits == split_dim, so this holds.
assert(split_dim % num_splits == 0);

IndexT numel = out_row * split_dim * out_col;
IndexT each_dim_size = split_dim / num_splits;
IndexT split_dim_with_out_col = split_dim * out_col;

IndexT offset = blockIdx.x * blockDim.x + threadIdx.x;
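  // The input is viewed as [out_row, split_dim, out_col], so a linear offset
  // decomposes as offset = (i * split_dim + j) * out_col + k; the divisions
  // below recover (i, j, k), with the division by out_col done via FastDivMod.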
if (each_dim_size == 1) {
for (; offset < numel; offset += blockDim.x * gridDim.x) {
auto col_divmod_rslt = col_divmoder.div_mod(offset);

IndexT i = offset / split_dim_with_out_col;
IndexT j = col_divmod_rslt[0] - i * split_dim;
IndexT k = col_divmod_rslt[1]; // offset % out_col

T* output = array.data[j];
if (output) {
IndexT output_idx = i * out_col + k;
*(output + output_idx) = input[offset];
}
}
} else {
for (; offset < numel; offset += blockDim.x * gridDim.x) {
auto col_divmod_rslt = col_divmoder.div_mod(offset);

IndexT i = offset / split_dim_with_out_col;
IndexT j = col_divmod_rslt[0] - i * split_dim;
IndexT k = col_divmod_rslt[1]; // offset % out_col

T* output = array.data[j / each_dim_size];
if (output) {
IndexT output_idx = (i + j % each_dim_size) * out_col + k;
*(output + output_idx) = input[offset];
}
}
}
}

template <typename T, typename IndexT, typename ArrayT>
__global__ void UnStackCudaKernelForLastDim(const T* __restrict__ in_data,
const IndexT cols,
const IndexT rows,
const IndexT tile_x_num,
ArrayT array) {
constexpr int buffer_size = 512;
__shared__ T s_buf[buffer_size];
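  // Each thread stages one input element in s_buf and then writes it to the
  // output tensor selected by its column index (one output per slice of the
  // stacked axis). The launch configuration in LaunchUnStackKernel uses at
  // most 512 threads per block, so buffer_size == 512 is sufficient.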

for (IndexT tile_x = blockIdx.x; tile_x < tile_x_num; tile_x += gridDim.x) {
IndexT row_idx = tile_x * blockDim.x + threadIdx.x;
IndexT col_idx = blockIdx.y * blockDim.y + threadIdx.y;
int s_idx = threadIdx.y * blockDim.x + threadIdx.x;
bool is_valid = (col_idx < cols && row_idx < rows);

if (is_valid) {
T data = in_data[row_idx * cols + col_idx];
s_buf[s_idx] = data;
}
__syncthreads();
if (is_valid) {
if (array.data[col_idx]) {
array.data[col_idx][row_idx] = s_buf[s_idx];
}
}
}
}

template <typename Context,
typename T,
typename IndexT,
SegmentedArraySize Size>
void LaunchUnStackKernel(const Context& ctx,
const IndexT out_row,
const IndexT split_dim,
const IndexT out_col,
const IndexT num_splits,
const DenseTensor& x,
std::vector<DenseTensor*>* outs) {
  // Each tensor in outs should have the same shape.
VLOG(6) << "out_row=" << out_row << ", split_dim=" << split_dim
<< ", out_col=" << out_col << ", num_splits=" << num_splits;

auto x_ptr = x.data<T>();
PointerArraySetter<Context, T, Size> setter(ctx, outs);

if (out_col == 1) {
// For the case axis == (x.dims().size() - 1)
constexpr int kThreads = 512;
constexpr int kWarpSize = 32;
constexpr int kMaxOut = 16;

int tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
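  // Heuristic: with few outputs, give each output its own threadIdx.y and
  // spend the remaining threads on rows; otherwise cap threadIdx.y at kMaxOut
  // and tile the outputs across blockIdx.y.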
if (split_dim < kMaxOut) {
tid_y = split_dim;
tid_x =
std::min(backends::gpu::RoundToNextHighPowOfTwo(out_row, kWarpSize),
kThreads / backends::gpu::RoundToNextHighPowOfTwo(tid_y));
} else {
tid_y = kMaxOut;
tid_x = kWarpSize;
bid_y = backends::gpu::DivUp<int>(split_dim, kMaxOut);
}
int tile_x_num = backends::gpu::DivUp<int>(out_row, tid_x);
bid_x = std::min(tile_x_num, backends::gpu::kMultiDimslimit);
dim3 blocks(tid_x, tid_y, 1);
dim3 grids(bid_x, bid_y, 1);

UnStackCudaKernelForLastDim<T, IndexT, decltype(setter.array)>
<<<grids, blocks, 0, ctx.stream()>>>(
x_ptr, split_dim, out_row, tile_x_num, setter.array);
} else {
GeneralDivMod<IndexT> col_divmoder(out_col);
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
ctx, out_row * split_dim * out_col);

UnStackCudaKernel<T, IndexT, decltype(setter.array)>
<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
ctx.stream()>>>(x_ptr,
out_row,
split_dim,
out_col,
num_splits,
col_divmoder,
setter.array);
}
}

template <typename T, typename Context>
void UnStackRawKernel(const Context& ctx,
const DenseTensor& x,
int axis,
std::vector<DenseTensor*>* outs) {
auto x_dims = x.dims();

  // The input tensor is split along the axis dimension into split_dim sub-tensors.
int64_t split_dim = x_dims[axis];

// Treat outs[i] as [out_row, out_col], and x as [out_row, split_dim,
// out_col].
int64_t out_row = 1;
for (int i = 0; i < axis; ++i) {
out_row *= x_dims[i];
}

int64_t out_col = x.numel() / (split_dim * out_row);

if (x.numel() < std::numeric_limits<int32_t>::max()) {
switch (CalcArraySize(split_dim)) {
SEGMENTED_ARRAY_KERNEL_HELPER(
LaunchUnStackKernel<Context, T, int32_t, kArraySize>(
ctx, out_row, split_dim, out_col, split_dim, x, outs));
}
} else {
switch (CalcArraySize(split_dim)) {
SEGMENTED_ARRAY_KERNEL_HELPER(
LaunchUnStackKernel<Context, T, int64_t, kArraySize>(
ctx, out_row, split_dim, out_col, split_dim, x, outs));
}
}
}

} // namespace funcs
} // namespace phi
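
For context, the other files changed in this commit (not shown in this excerpt) rewire the stack/unstack GPU kernels to call the shared helpers above. A minimal sketch of what that wiring plausibly looks like for stack; the public kernel signature here is an assumption following the phi convention, not copied from the diff:

// Sketch only (assumed wiring; see paddle/phi/kernels/gpu/stack_kernel.cu in
// the commit for the actual code, which is not part of this excerpt).
template <typename T, typename Context>
void StackKernel(const Context& ctx,
                 const std::vector<const DenseTensor*>& x,
                 int axis,
                 DenseTensor* out) {
  funcs::StackRawKernel<T, Context>(ctx, x, axis, out);
}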