forked from PaddlePaddle/Paddle
Commit
Unify the gpu implementation of stack and unstack to reuse the optimization. (PaddlePaddle#49748)

* Unify the gpu implementation of stack and unstack to reuse the optimization.
* Optimize the cuda implementation of unstack.
* Use GpuMemcpyAsync instead of memory::Copy.
* Fix error of calculating the index.
* Use FastDivMod to further improve the performance of unstack.
1 parent 772e3f4 commit 18756fa

Showing 8 changed files with 338 additions and 234 deletions.
@@ -0,0 +1,276 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/fast_divmod.h"
#include "paddle/phi/kernels/funcs/segmented_array.h"

namespace phi {
namespace funcs {

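// Stack: each input tensor is viewed as a [rows, split_size] matrix and the
// output as [rows, cols] with cols = split_size * (number of inputs). A
// thread handles one output element: divmoder (FastDivMod-based) splits the
// output column into (input index, column offset inside that input), and
// grid-stride loops in x (columns) and y (rows) cover the whole output.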
template <typename T, typename IndexT, typename ArrayT>
__global__ void StackCudaKernel(ArrayT array,
                                GeneralDivMod<IndexT> divmoder,
                                IndexT split_size,
                                IndexT rows,
                                IndexT cols,
                                T* __restrict__ output) {
  IndexT grid_x = static_cast<IndexT>(blockIdx.x) * blockDim.x + threadIdx.x;
  IndexT grid_x_stride = static_cast<IndexT>(blockDim.x) * gridDim.x;
  IndexT grid_y_stride = static_cast<IndexT>(blockDim.y) * gridDim.y;

  for (; grid_x < cols; grid_x += grid_x_stride) {
    IndexT grid_y = static_cast<IndexT>(blockIdx.y) * blockDim.y + threadIdx.y;

    auto divmod_rslt = divmoder.div_mod(grid_x);
    IndexT split = divmod_rslt[0];       // grid_x / split_size
    IndexT col_offset = divmod_rslt[1];  // grid_x % split_size
    const T* input_ptr = array.data[split];
#pragma unroll
    for (; grid_y < rows; grid_y += grid_y_stride) {
      output[grid_y * cols + grid_x] =
          input_ptr[grid_y * split_size + col_offset];
    }
  }
}

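// Launch helper for StackCudaKernel: a 2D launch config covers out_col
// columns (x) and x_row rows (y). ConstPointerArraySetter is assumed (from
// segmented_array.h) to pack the input pointers into a kernel-visible array
// sized by the compile-time `Size`, and GeneralDivMod precomputes the
// fast-division constants for x_col before the kernel runs.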
template <typename Context,
          typename T,
          typename IndexT,
          SegmentedArraySize Size>
void LaunchStackKernel(const Context& ctx,
                       const IndexT x_col,
                       const IndexT x_row,
                       const IndexT out_col,
                       const std::vector<const DenseTensor*>& x,
                       DenseTensor* out) {
  T* out_ptr = ctx.template Alloc<T>(out);
  auto config = phi::backends::gpu::GetGpuLaunchConfig2D(ctx, out_col, x_row);

  ConstPointerArraySetter<Context, T, Size> setter(ctx, x);
  GeneralDivMod<IndexT> divmoder(x_col);
  StackCudaKernel<T, IndexT, decltype(setter.array)>
      <<<config.block_per_grid, config.thread_per_block, 0, ctx.stream()>>>(
          setter.array, divmoder, x_col, x_row, out_col, out_ptr);
}

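// Entry point for stack: every input is flattened to [x_row, x_col] around
// the (normalized) axis, so the result is a [x_row, out_col] matrix with
// out_col = x_col * num. 32-bit indexing is used whenever the output fits in
// int32 range and 64-bit indexing otherwise; SEGMENTED_ARRAY_KERNEL_HELPER
// dispatches on the pointer-array size needed for `num` inputs.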
template <typename T, typename Context>
void StackRawKernel(const Context& ctx,
                    const std::vector<const DenseTensor*>& x,
                    int axis,
                    DenseTensor* out) {
  if (axis < 0) axis += (x[0]->dims().size() + 1);
  int num = static_cast<int>(x.size());

  // Flatten the dims of each input around `axis` into a matrix of shape
  // [x_row, x_col]; the output tensor then has shape [x_row, out_col].
  int64_t x_row = 1;
  for (int i = 0; i < axis; ++i) {
    x_row *= x[0]->dims()[i];
  }
  int64_t x_col = x[0]->numel() / x_row;
  int64_t out_col = x_col * num;

  if (out->numel() < std::numeric_limits<int32_t>::max()) {
    switch (CalcArraySize(num)) {
      SEGMENTED_ARRAY_KERNEL_HELPER(
          LaunchStackKernel<Context, T, int32_t, kArraySize>(
              ctx, x_col, x_row, out_col, x, out));
    }
  } else {
    switch (CalcArraySize(num)) {
      SEGMENTED_ARRAY_KERNEL_HELPER(
          LaunchStackKernel<Context, T, int64_t, kArraySize>(
              ctx, x_col, x_row, out_col, x, out));
    }
  }
}

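// General unstack kernel: the input is viewed as [out_row, split_dim,
// out_col] and each flattened offset is decomposed into (i, j, k) using one
// FastDivMod by out_col plus a plain division by split_dim * out_col.
// Element (i, j, k) goes to output tensor j / each_dim_size at position
// (i, j % each_dim_size, k); null output pointers are skipped. For unstack
// itself num_splits == split_dim, so each_dim_size == 1 and the first,
// simpler branch is the one that runs.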
template <typename T, typename IndexT, typename ArrayT>
__global__ void UnStackCudaKernel(const T* __restrict__ input,
                                  IndexT out_row,
                                  IndexT split_dim,
                                  IndexT out_col,
                                  IndexT num_splits,
                                  GeneralDivMod<IndexT> col_divmoder,
                                  ArrayT array) {
  assert(blockDim.y == 1);
  assert(blockDim.z == 1);
  // In this case they are equal
  assert(split_dim % num_splits == 0);

  IndexT numel = out_row * split_dim * out_col;
  IndexT each_dim_size = split_dim / num_splits;
  IndexT split_dim_with_out_col = split_dim * out_col;

  IndexT offset = blockIdx.x * blockDim.x + threadIdx.x;
  if (each_dim_size == 1) {
    for (; offset < numel; offset += blockDim.x * gridDim.x) {
      auto col_divmod_rslt = col_divmoder.div_mod(offset);

      IndexT i = offset / split_dim_with_out_col;
      IndexT j = col_divmod_rslt[0] - i * split_dim;
      IndexT k = col_divmod_rslt[1];  // offset % out_col

      T* output = array.data[j];
      if (output) {
        IndexT output_idx = i * out_col + k;
        *(output + output_idx) = input[offset];
      }
    }
  } else {
    for (; offset < numel; offset += blockDim.x * gridDim.x) {
      auto col_divmod_rslt = col_divmoder.div_mod(offset);

      IndexT i = offset / split_dim_with_out_col;
      IndexT j = col_divmod_rslt[0] - i * split_dim;
      IndexT k = col_divmod_rslt[1];  // offset % out_col

      T* output = array.data[j / each_dim_size];
      if (output) {
        IndexT output_idx =
            (i * each_dim_size + j % each_dim_size) * out_col + k;
        *(output + output_idx) = input[offset];
      }
    }
  }
}

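// Specialized unstack kernel for out_col == 1 (split along the innermost
// dimension): the input is a [rows, cols] matrix and output tensor col_idx
// receives column col_idx. Each valid thread stages its element in the
// shared buffer (sized 512, matching the at-most-512-thread blocks chosen by
// the launcher below) and then writes it to array.data[col_idx][row_idx],
// skipping null output pointers; blocks stride over row tiles via tile_x.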
template <typename T, typename IndexT, typename ArrayT>
__global__ void UnStackCudaKernelForLastDim(const T* __restrict__ in_data,
                                            const IndexT cols,
                                            const IndexT rows,
                                            const IndexT tile_x_num,
                                            ArrayT array) {
  constexpr int buffer_size = 512;
  __shared__ T s_buf[buffer_size];

  for (IndexT tile_x = blockIdx.x; tile_x < tile_x_num; tile_x += gridDim.x) {
    IndexT row_idx = tile_x * blockDim.x + threadIdx.x;
    IndexT col_idx = blockIdx.y * blockDim.y + threadIdx.y;
    int s_idx = threadIdx.y * blockDim.x + threadIdx.x;
    bool is_valid = (col_idx < cols && row_idx < rows);

    if (is_valid) {
      T data = in_data[row_idx * cols + col_idx];
      s_buf[s_idx] = data;
    }
    __syncthreads();
    if (is_valid) {
      if (array.data[col_idx]) {
        array.data[col_idx][row_idx] = s_buf[s_idx];
      }
    }
  }
}

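// Dispatches between the two unstack kernels. For out_col == 1 a 2D block is
// shaped so blockDim.x * blockDim.y <= kThreads (512): the y dimension
// enumerates output tensors (at most kMaxOut per block) and the x dimension
// walks the rows. Otherwise the generic kernel is launched with a 1D config
// over all out_row * split_dim * out_col elements.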
template <typename Context,
          typename T,
          typename IndexT,
          SegmentedArraySize Size>
void LaunchUnStackKernel(const Context& ctx,
                         const IndexT out_row,
                         const IndexT split_dim,
                         const IndexT out_col,
                         const IndexT num_splits,
                         const DenseTensor& x,
                         std::vector<DenseTensor*>* outs) {
  // Each tensor in outs should have the same shape.
  VLOG(6) << "out_row=" << out_row << ", split_dim=" << split_dim
          << ", out_col=" << out_col << ", num_splits=" << num_splits;

  auto x_ptr = x.data<T>();
  PointerArraySetter<Context, T, Size> setter(ctx, outs);

  if (out_col == 1) {
    // For the case axis == (x.dims().size() - 1)
    constexpr int kThreads = 512;
    constexpr int kWarpSize = 32;
    constexpr int kMaxOut = 16;

    int tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
    if (split_dim < kMaxOut) {
      tid_y = split_dim;
      tid_x =
          std::min(backends::gpu::RoundToNextHighPowOfTwo(out_row, kWarpSize),
                   kThreads / backends::gpu::RoundToNextHighPowOfTwo(tid_y));
    } else {
      tid_y = kMaxOut;
      tid_x = kWarpSize;
      bid_y = backends::gpu::DivUp<int>(split_dim, kMaxOut);
    }
    int tile_x_num = backends::gpu::DivUp<int>(out_row, tid_x);
    bid_x = std::min(tile_x_num, backends::gpu::kMultiDimslimit);
    dim3 blocks(tid_x, tid_y, 1);
    dim3 grids(bid_x, bid_y, 1);

    UnStackCudaKernelForLastDim<T, IndexT, decltype(setter.array)>
        <<<grids, blocks, 0, ctx.stream()>>>(
            x_ptr, split_dim, out_row, tile_x_num, setter.array);
  } else {
    GeneralDivMod<IndexT> col_divmoder(out_col);
    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
        ctx, out_row * split_dim * out_col);

    UnStackCudaKernel<T, IndexT, decltype(setter.array)>
        <<<config.block_per_grid.x,
           config.thread_per_block.x,
           0,
           ctx.stream()>>>(x_ptr,
                           out_row,
                           split_dim,
                           out_col,
                           num_splits,
                           col_divmoder,
                           setter.array);
  }
}

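// Entry point for unstack: x is viewed as [out_row, split_dim, out_col] with
// split_dim = x_dims[axis], and each outs[i] as [out_row, out_col]. As in
// StackRawKernel, 32-bit indexing is chosen when x.numel() fits in int32
// range; num_splits is passed as split_dim, so each output slice covers
// exactly one step along the axis.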
template <typename T, typename Context>
void UnStackRawKernel(const Context& ctx,
                      const DenseTensor& x,
                      int axis,
                      std::vector<DenseTensor*>* outs) {
  auto x_dims = x.dims();

  // The input tensor is split into split_dim tensors along the axis
  // dimension.
  int64_t split_dim = x_dims[axis];

  // Treat outs[i] as [out_row, out_col], and x as [out_row, split_dim,
  // out_col].
  int64_t out_row = 1;
  for (int i = 0; i < axis; ++i) {
    out_row *= x_dims[i];
  }

  int64_t out_col = x.numel() / (split_dim * out_row);

  if (x.numel() < std::numeric_limits<int32_t>::max()) {
    switch (CalcArraySize(split_dim)) {
      SEGMENTED_ARRAY_KERNEL_HELPER(
          LaunchUnStackKernel<Context, T, int32_t, kArraySize>(
              ctx, out_row, split_dim, out_col, split_dim, x, outs));
    }
  } else {
    switch (CalcArraySize(split_dim)) {
      SEGMENTED_ARRAY_KERNEL_HELPER(
          LaunchUnStackKernel<Context, T, int64_t, kArraySize>(
              ctx, out_row, split_dim, out_col, split_dim, x, outs));
    }
  }
}

}  // namespace funcs
}  // namespace phi